Skip to content

Commit 1d63200

Browse files
committed
Extract list parameters in parse_form_data
When the client prepares the request, it turns list parameters into multiple instances of the same key. For instance, `extract_image_block_types=["Image", "Table"]` becomes `extract_image_block_types[]="Image"` and `extract_image_block_types[]="Table"`. We need to account for this in our `parse_form_data` helper if we want to use list params in our hooks: parts that share the same key are consolidated into a single list value. Likewise, we need to do the reverse when recreating the request in `create_request_body`, flattening each list value back into repeated fields with the same key.
1 parent 42a9a24 commit 1d63200

File tree

3 files changed

+74
-27
lines changed

3 files changed

+74
-27
lines changed

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,8 @@ def test_unit_create_response():
122122

123123

124124
def test_unit_create_request():
125-
"""Test create request method properly sets file, Content-Type and Content-Length headers."""
125+
"""Test create request method properly sets file, Content-Type and Content-Length headers.
126+
List parameters should be flattened in the body."""
126127

127128
# Prepare test data
128129
request = requests.PreparedRequest()
@@ -133,27 +134,27 @@ def test_unit_create_request():
133134
form_data = {
134135
"parameter_1": "value_1",
135136
"parameter_2": "value_2",
137+
"list_parameter": ["value_1", "value_2"],
136138
}
137139
page = (io.BytesIO(b"page_content"), 1)
138140
filename = "test_file.pdf"
139141

140142
# Expected results
141-
expected_payload = {
142-
"parameter_1": "value_1",
143-
"parameter_2": "value_2",
144-
"split_pdf_page": "false",
145-
"starting_page_number": "7",
146-
}
147143
expected_page_filename = "test_file.pdf"
148144
expected_body = MultipartEncoder(
149-
fields={
150-
**expected_payload,
151-
"files": (
145+
fields=[
146+
("parameter_1", "value_1"),
147+
("parameter_2", "value_2"),
148+
("list_parameter", "value_1"),
149+
("list_parameter", "value_2"),
150+
("split_pdf_page", "false"),
151+
("starting_page_number", "7"),
152+
("files", (
152153
expected_page_filename,
153154
page[0],
154155
"application/pdf",
155-
),
156-
}
156+
)),
157+
]
157158
)
158159
expected_url = ""
159160

@@ -164,7 +165,10 @@ def test_unit_create_request():
164165
# Assert the request object
165166
assert request_obj.method == "POST"
166167
assert request_obj.url == expected_url
167-
assert request_obj.data.fields == expected_body.fields
168+
169+
# Validate fields ignoring order
170+
assert set(request_obj.data.fields) == set(expected_body.fields)
171+
168172
assert request_content_type.startswith("multipart/form-data")
169173

170174

@@ -191,11 +195,37 @@ def test_unit_decode_content_disposition():
191195

192196

193197
def test_unit_parse_form_data():
194-
"""Test parse form data method properly parses the form data and returns dictionary."""
198+
"""Test parse form data method properly parses the form data and returns dictionary.
199+
Parameters with the same key should be consolidated to a list."""
195200

196201
# Prepare test data
202+
test_form_data = (
203+
b"--boundary\r\n"
204+
b"Content-Disposition: form-data; name=\"files\"; filename=\"test_file.pdf\"\r\n"
205+
b"\r\n"
206+
b"file_content\r\n"
207+
b"--boundary\r\n"
208+
b"Content-Disposition: form-data; name=\"parameter_1\"\r\n"
209+
b"\r\n"
210+
b"value_1\r\n"
211+
b"--boundary\r\n"
212+
b"Content-Disposition: form-data; name=\"parameter_2\"\r\n"
213+
b"\r\n"
214+
b"value_2\r\n"
215+
b"--boundary\r\n"
216+
b"Content-Disposition: form-data; name=\"list_parameter\"\r\n"
217+
b"\r\n"
218+
b"value_1\r\n"
219+
b"--boundary\r\n"
220+
b"Content-Disposition: form-data; name=\"list_parameter\"\r\n"
221+
b"\r\n"
222+
b"value_2\r\n"
223+
b"--boundary--\r\n"
224+
)
225+
226+
197227
decoded_data = MultipartDecoder(
198-
b'--boundary\r\nContent-Disposition: form-data; name="files"; filename="test_file.pdf"\r\n\r\nfile_content\r\n--boundary\r\nContent-Disposition: form-data; name="parameter_1"\r\n\r\nvalue_1\r\n--boundary\r\nContent-Disposition: form-data; name="parameter_2"\r\n\r\nvalue_2\r\n--boundary--\r\n',
228+
test_form_data,
199229
"multipart/form-data; boundary=boundary",
200230
)
201231

@@ -204,6 +234,7 @@ def test_unit_parse_form_data():
204234
"files": shared.Files(b"file_content", "test_file.pdf"),
205235
"parameter_1": "value_1",
206236
"parameter_2": "value_2",
237+
"list_parameter": ["value_1", "value_2"],
207238
}
208239

209240
# Parse form data
@@ -212,6 +243,7 @@ def test_unit_parse_form_data():
212243
# Assert the parsed form data
213244
assert form_data.get("parameter_1") == expected_form_data.get("parameter_1")
214245
assert form_data.get("parameter_2") == expected_form_data.get("parameter_2")
246+
assert form_data.get("list_parameter") == expected_form_data.get("list_parameter")
215247
assert form_data.get("files").file_name == expected_form_data.get("files").file_name
216248

217249
assert form_data.get("files").content == expected_form_data.get("files").content

src/unstructured_client/_hooks/custom/form_utils.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from unstructured_client.models import shared
1010

1111
logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)
12-
FormData = dict[str, Union[str, shared.Files]]
12+
FormData = dict[str, Union[str, shared.Files, list[str]]]
1313

1414
PARTITION_FORM_FILES_KEY = "files"
1515
PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page"
@@ -148,6 +148,13 @@ def parse_form_data(decoded_data: MultipartDecoder) -> FormData:
148148
raise ValueError("Filename can't be an empty string.")
149149
form_data[PARTITION_FORM_FILES_KEY] = shared.Files(part.content, filename)
150150
else:
151-
form_data[name] = part.content.decode()
151+
content = part.content.decode()
152+
if name in form_data:
153+
if isinstance(form_data[name], list):
154+
form_data[name].append(content)
155+
else:
156+
form_data[name] = [form_data[name], content]
157+
else:
158+
form_data[name] = content
152159

153160
return form_data

src/unstructured_client/_hooks/custom/request_utils.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import io
66
import json
77
import logging
8-
from typing import Optional, Tuple
8+
from typing import Optional, Tuple, Any
99

1010
import httpx
1111
import requests
@@ -27,16 +27,24 @@ def create_request_body(
2727
form_data: FormData, page_content: io.BytesIO, filename: str, page_number: int
2828
) -> MultipartEncoder:
2929
payload = prepare_request_payload(form_data)
30+
31+
payload_fields: list[tuple[str, Any]] = []
32+
for key, value in payload.items():
33+
if isinstance(value, list):
34+
payload_fields.extend([(key, list_value) for list_value in value])
35+
else:
36+
payload_fields.append((key, value))
37+
38+
payload_fields.append((PARTITION_FORM_FILES_KEY, (
39+
filename,
40+
page_content,
41+
"application/pdf",
42+
)))
43+
44+
payload_fields.append((PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, str(page_number)))
45+
3046
body = MultipartEncoder(
31-
fields={
32-
**payload,
33-
PARTITION_FORM_FILES_KEY: (
34-
filename,
35-
page_content,
36-
"application/pdf",
37-
),
38-
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY: str(page_number),
39-
}
47+
fields=payload_fields
4048
)
4149
return body
4250

0 commit comments

Comments
 (0)