Skip to content

Commit 0cadf93

Browse files
feat: add unique_element_ids param (Unstructured-IO#399)
unique_element_ids added as API parameter, and passed into the library.
1 parent a330ea4 commit 0cadf93

File tree

9 files changed

+111
-6
lines changed

9 files changed

+111
-6
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.0.66-dev0
2+
3+
* Add support for `unique_element_ids` parameter.
4+
15
## 0.0.65
26

37
* Bump unstructured to 0.12.4

README.md

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ various heuristics to detect the filetypes after uncompressing from .gz.
208208
When processing XML documents, set the `xml_keep_tags` parameter to `true` to retain the XML tags in the output. If not specified, it will simply extract the text from within the tags.
209209

210210
```
211-
curl -X 'POST'
211+
curl -X 'POST' \
212212
'https://api.unstructured.io/general/v0/general' \
213213
-H 'accept: application/json' \
214214
-H 'Content-Type: multipart/form-data' \
@@ -222,7 +222,7 @@ curl -X 'POST'
222222
For supported filetypes, set the `include_page_breaks` parameter to `true` to include `PageBreak` elements in the output.
223223

224224
```
225-
curl -X 'POST'
225+
curl -X 'POST' \
226226
'https://api.unstructured.io/general/v0/general' \
227227
-H 'accept: application/json' \
228228
-H 'Content-Type: multipart/form-data' \
@@ -232,6 +232,26 @@ curl -X 'POST'
232232
```
233233

234234

235+
#### Unique element IDs
236+
237+
By default, the element ID is a SHA-256 hash of the element text. This is to ensure that
238+
the ID is deterministic. One downside is that the ID is not guaranteed to be unique.
239+
Different elements with the same text will have the same ID, and there could also be hash collisions.
240+
To use UUIDs in the output instead, set `unique_element_ids=true`. Note: this means that the element IDs
241+
will be random, so with every partition of the same file, you will get different IDs.
242+
Unique IDs can be helpful if you'd like to use them as a primary key in a database, for example.
243+
244+
```
245+
curl -X 'POST' \
246+
'https://api.unstructured.io/general/v0/general' \
247+
-H 'accept: application/json' \
248+
-H 'Content-Type: multipart/form-data' \
249+
-F 'files=@sample-docs/layout-parser-paper-fast.pdf' \
250+
-F 'unique_element_ids=true' \
251+
| jq -C . | less -R
252+
```
253+
254+
235255
#### Chunking Elements
236256

237257
Use the `chunking_strategy` form-field to chunk text into larger or smaller elements. Defaults to `None` which performs no chunking. The available chunking strategies are `basic` and `by_title`.

openapi.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,11 @@
240240
"default": [],
241241
"description": "The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields",
242242
"example": ["image", "table"]
243+
},
244+
"unique_element_ids": {
245+
"type": "boolean",
246+
"title": "Unique element IDs",
247+
"description": "When True, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: False"
243248
}
244249
},
245250
"type": "object",

prepline_general/api/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
app = FastAPI(
1313
title="Unstructured Pipeline API",
1414
summary="Partition documents with the Unstructured library",
15-
version="0.0.65",
15+
version="0.0.66",
1616
docs_url="/general/docs",
1717
openapi_url="/general/openapi.json",
1818
servers=[

prepline_general/api/general.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ def pipeline_api(
299299
xml_keep_tags: bool = False,
300300
languages: Optional[List[str]] = None,
301301
extract_image_block_types: Optional[List[str]] = None,
302+
unique_element_ids: Optional[bool] = False,
302303
) -> List[Dict[str, Any]] | str:
303304
if filename.endswith(".msg"):
304305
# Note(yuming): convert file type for msg files
@@ -333,6 +334,7 @@ def pipeline_api(
333334
"xml_keep_tags": xml_keep_tags,
334335
"languages": languages,
335336
"extract_image_block_types": extract_image_block_types,
337+
"unique_element_ids": unique_element_ids,
336338
"chunking_strategy": chunking_strategy,
337339
"combine_under_n_chars": combine_under_n_chars,
338340
"max_characters": max_characters,
@@ -390,6 +392,7 @@ def pipeline_api(
390392
"overlap_all": overlap_all,
391393
"extract_image_block_types": extract_image_block_types,
392394
"extract_image_block_to_payload": extract_image_block_to_payload,
395+
"unique_element_ids": unique_element_ids,
393396
},
394397
default=str,
395398
)
@@ -418,6 +421,7 @@ def pipeline_api(
418421
"overlap_all": overlap_all,
419422
"extract_image_block_types": extract_image_block_types,
420423
"extract_image_block_to_payload": extract_image_block_to_payload,
424+
"unique_element_ids": unique_element_ids,
421425
}
422426

423427
if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
@@ -692,7 +696,7 @@ def return_content_type(filename: str):
692696

693697

694698
@router.get("/general/v0/general", include_in_schema=False)
695-
@router.get("/general/v0.0.65/general", include_in_schema=False)
699+
@router.get("/general/v0.0.66/general", include_in_schema=False)
696700
async def handle_invalid_get_request():
697701
raise HTTPException(
698702
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -707,7 +711,7 @@ async def handle_invalid_get_request():
707711
description="Description",
708712
operation_id="partition_parameters",
709713
)
710-
@router.post("/general/v0.0.65/general", include_in_schema=False)
714+
@router.post("/general/v0.0.66/general", include_in_schema=False)
711715
def general_partition(
712716
request: Request,
713717
# cannot use annotated type here because of a bug described here:
@@ -778,6 +782,7 @@ def response_generator(is_multipart: bool):
778782
file_content_type=file_content_type,
779783
languages=form_params.languages,
780784
extract_image_block_types=form_params.extract_image_block_types,
785+
unique_element_ids=form_params.unique_element_ids,
781786
# -- chunking options --
782787
chunking_strategy=chunking_strategy,
783788
combine_under_n_chars=form_params.combine_under_n_chars,

prepline_general/api/models/form_params.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class GeneralFormParams(BaseModel):
2626
pdf_infer_table_structure: bool
2727
strategy: str
2828
extract_image_block_types: Optional[List[str]]
29+
unique_element_ids: bool
30+
# -- chunking options --
2931
chunking_strategy: Optional[str]
3032
combine_under_n_chars: Optional[int]
3133
max_characters: int
@@ -152,6 +154,15 @@ def as_form(
152154
),
153155
BeforeValidator(SmartValueParser[List[str]]().value_or_first_element),
154156
] = [], # noqa
157+
unique_element_ids: Annotated[
158+
bool,
159+
Form(
160+
title="unique_element_ids",
161+
description="""When `True`, assign UUIDs to element IDs, which guarantees their uniqueness
162+
(useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: False""",
163+
example=True,
164+
),
165+
] = False,
155166
# -- chunking options --
156167
chunking_strategy: Annotated[
157168
Optional[Literal["by_title"]],
@@ -236,4 +247,5 @@ def as_form(
236247
new_after_n_chars=new_after_n_chars,
237248
overlap=overlap,
238249
overlap_all=overlap_all,
250+
unique_element_ids=unique_element_ids,
239251
)

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.65
2+
version: 0.0.66

sample-docs/fake-xml.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,10 @@
2424
<leader>Keith Rowley</leader>
2525
<sport>Track &amp; Field</sport>
2626
</country>
27+
<country>
28+
<name>Trinidad &amp; Tobado</name>
29+
<capital>Port of Spain</capital>
30+
<leader>Keith Rowley</leader>
31+
<sport>Track &amp; Field</sport>
32+
</country>
2733
</factbook>

test_general/api/test_app.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import uuid
12
from pathlib import Path
23
import os
34

@@ -313,6 +314,56 @@ def test_xml_keep_tags_param():
313314
assert element["text"].replace("&", "&amp;") in response_with_xml_tags["text"]
314315

315316

317+
def test_element_ids_by_default_non_unique():
318+
"""
319+
Verify that by default the element_ids aren't unique.
320+
"""
321+
client = TestClient(app)
322+
test_file = Path("sample-docs") / "fake-xml.xml"
323+
response = client.post(
324+
MAIN_API_ROUTE,
325+
files=[("files", (str(test_file), open(test_file, "rb")))],
326+
data={},
327+
)
328+
assert response.status_code == 200
329+
elements = response.json()
330+
331+
# Check that the ids are not all unique by default, because this xml file has a
332+
# duplicated last element.
333+
ids = [element["element_id"] for element in elements]
334+
# If there are duplicate ids in the ids list, the count of resulting
335+
# set will be lower than the count of ids - which is expected here.
336+
assert len(ids) != len(set(ids)), "Elements have unique ids"
337+
338+
339+
def test_unique_element_ids_param():
340+
"""
341+
Verify that when requested, the element_ids are unique.
342+
"""
343+
client = TestClient(app)
344+
test_file = Path("sample-docs") / "fake-xml.xml"
345+
346+
response = client.post(
347+
MAIN_API_ROUTE,
348+
files=[("files", (str(test_file), open(test_file, "rb")))],
349+
data={
350+
"unique_element_ids": "True",
351+
},
352+
)
353+
assert response.status_code == 200
354+
elements = response.json()
355+
356+
ids = [element["element_id"] for element in elements]
357+
# If all ids are unique, the count of resulting set
358+
# will be same as the count of ids - which is expected here.
359+
assert len(ids) == len(set(ids)), "Elements have non-unique ids"
360+
361+
try:
362+
uuid.UUID(ids[0], version=4)
363+
except ValueError:
364+
raise AssertionError("Element ID is not in UUID format.")
365+
366+
316367
def test_include_page_breaks_param():
317368
"""
318369
Verify that responses do not include page breaks unless requested
@@ -539,6 +590,7 @@ def test_parallel_mode_passes_params(monkeypatch):
539590
"strategy": "hi_res",
540591
"xml_keep_tags": "True",
541592
"skip_infer_table_types": "foo",
593+
"unique_element_ids": "True",
542594
# -- chunking options --
543595
"chunking_strategy": "by_title",
544596
"combine_under_n_chars": "501",
@@ -567,6 +619,7 @@ def test_parallel_mode_passes_params(monkeypatch):
567619
skip_infer_table_types=["foo"],
568620
extract_image_block_types=None,
569621
extract_image_block_to_payload=False,
622+
unique_element_ids=True,
570623
# -- chunking options --
571624
chunking_strategy="by_title",
572625
combine_text_under_n_chars=501,

0 commit comments

Comments
 (0)