feat: add unique_element_ids param (Unstructured-IO#399)

hubert-rutkowski85 · web-flow · commit 0cadf93f62c9 · 2024-04-02T10:51:44.000-04:00
unique_element_ids added as API parameter, and passed into the library.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.0.66
+
+* Add support for `unique_element_ids` parameter.
+
 ## 0.0.65
 
 * Bump unstructured to 0.12.4
diff --git a/README.md b/README.md
@@ -208,7 +208,7 @@ various heuristics to detect the filetypes after uncompressing from .gz.
 When processing XML documents, set the `xml_keep_tags` parameter to `true` to retain the XML tags in the output. If not specified, it will simply extract the text from within the tags.
 
 ```
-curl -X 'POST' 
+curl -X 'POST' \
  'https://api.unstructured.io/general/v0/general' \
  -H 'accept: application/json'  \
  -H 'Content-Type: multipart/form-data' \
@@ -222,7 +222,7 @@ curl -X 'POST'
 For supported filetypes, set the `include_page_breaks` parameter to `true` to include `PageBreak` elements in the output.
 
 ```
-curl -X 'POST' 
+curl -X 'POST' \
  'https://api.unstructured.io/general/v0/general' \
  -H 'accept: application/json'  \
  -H 'Content-Type: multipart/form-data' \
@@ -232,6 +232,26 @@ curl -X 'POST'
 ```
 
 
+#### Unique element IDs
+
+By default, the element ID is a SHA-256 hash of the element text. This is to ensure that
+the ID is deterministic. One downside is that the ID is not guaranteed to be unique.
+Different elements with the same text will have the same ID, and there could also be hash collisions.
+To use UUIDs in the output instead, set `unique_element_ids=true`. Note: this means that the element IDs
+will be random, so with every partition of the same file, you will get different IDs.
+Unique IDs can be helpful if you'd like to use them as a primary key in a database, for example.
+
+```
+curl -X 'POST' \
+ 'https://api.unstructured.io/general/v0/general' \
+ -H 'accept: application/json'  \
+ -H 'Content-Type: multipart/form-data' \
+ -F 'files=@sample-docs/layout-parser-paper-fast.pdf' \
+ -F 'unique_element_ids=true' \
+ | jq -C . | less -R
+```
+
+
 #### Chunking Elements
 
 Use the `chunking_strategy` form-field to chunk text into larger or smaller elements. Defaults to `None` which performs no chunking. The available chunking strategies are `basic` and `by_title`.
diff --git a/openapi.json b/openapi.json
@@ -240,6 +240,11 @@
                         "default": [],
                         "description": "The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields",
                         "example": ["image", "table"]
+                    },
+                    "unique_element_ids": {
+                        "type": "boolean",
+                        "title": "Unique element IDs",
+                        "description": "When True, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: False"
                     }
                 },
                 "type": "object",
diff --git a/prepline_general/api/app.py b/prepline_general/api/app.py
@@ -12,7 +12,7 @@
 app = FastAPI(
     title="Unstructured Pipeline API",
     summary="Partition documents with the Unstructured library",
-    version="0.0.65",
+    version="0.0.66",
     docs_url="/general/docs",
     openapi_url="/general/openapi.json",
     servers=[
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -299,6 +299,7 @@ def pipeline_api(
     xml_keep_tags: bool = False,
     languages: Optional[List[str]] = None,
     extract_image_block_types: Optional[List[str]] = None,
+    unique_element_ids: Optional[bool] = False,
 ) -> List[Dict[str, Any]] | str:
     if filename.endswith(".msg"):
         # Note(yuming): convert file type for msg files
@@ -333,6 +334,7 @@ def pipeline_api(
                         "xml_keep_tags": xml_keep_tags,
                         "languages": languages,
                         "extract_image_block_types": extract_image_block_types,
+                        "unique_element_ids": unique_element_ids,
                         "chunking_strategy": chunking_strategy,
                         "combine_under_n_chars": combine_under_n_chars,
                         "max_characters": max_characters,
@@ -390,6 +392,7 @@ def pipeline_api(
                         "overlap_all": overlap_all,
                         "extract_image_block_types": extract_image_block_types,
                         "extract_image_block_to_payload": extract_image_block_to_payload,
+                        "unique_element_ids": unique_element_ids,
                     },
                     default=str,
                 )
@@ -418,6 +421,7 @@ def pipeline_api(
             "overlap_all": overlap_all,
             "extract_image_block_types": extract_image_block_types,
             "extract_image_block_to_payload": extract_image_block_to_payload,
+            "unique_element_ids": unique_element_ids,
         }
 
         if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
@@ -692,7 +696,7 @@ def return_content_type(filename: str):
 
 
 @router.get("/general/v0/general", include_in_schema=False)
-@router.get("/general/v0.0.65/general", include_in_schema=False)
+@router.get("/general/v0.0.66/general", include_in_schema=False)
 async def handle_invalid_get_request():
     raise HTTPException(
         status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -707,7 +711,7 @@ async def handle_invalid_get_request():
     description="Description",
     operation_id="partition_parameters",
 )
-@router.post("/general/v0.0.65/general", include_in_schema=False)
+@router.post("/general/v0.0.66/general", include_in_schema=False)
 def general_partition(
     request: Request,
     # cannot use annotated type here because of a bug described here:
@@ -778,6 +782,7 @@ def response_generator(is_multipart: bool):
                 file_content_type=file_content_type,
                 languages=form_params.languages,
                 extract_image_block_types=form_params.extract_image_block_types,
+                unique_element_ids=form_params.unique_element_ids,
                 # -- chunking options --
                 chunking_strategy=chunking_strategy,
                 combine_under_n_chars=form_params.combine_under_n_chars,
diff --git a/prepline_general/api/models/form_params.py b/prepline_general/api/models/form_params.py
@@ -26,6 +26,8 @@ class GeneralFormParams(BaseModel):
     pdf_infer_table_structure: bool
     strategy: str
     extract_image_block_types: Optional[List[str]]
+    unique_element_ids: bool
+    # -- chunking options --
     chunking_strategy: Optional[str]
     combine_under_n_chars: Optional[int]
     max_characters: int
@@ -152,6 +154,15 @@ def as_form(
             ),
             BeforeValidator(SmartValueParser[List[str]]().value_or_first_element),
         ] = [],  # noqa
+        unique_element_ids: Annotated[
+            bool,
+            Form(
+                title="unique_element_ids",
+                description="""When `True`, assign UUIDs to element IDs, which guarantees their uniqueness 
+(useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: False""",
+                example=True,
+            ),
+        ] = False,
         # -- chunking options --
         chunking_strategy: Annotated[
             Optional[Literal["by_title"]],
@@ -236,4 +247,5 @@ def as_form(
             new_after_n_chars=new_after_n_chars,
             overlap=overlap,
             overlap_all=overlap_all,
+            unique_element_ids=unique_element_ids,
         )
diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml
@@ -1,2 +1,2 @@
 name: general
-version: 0.0.65
+version: 0.0.66
diff --git a/sample-docs/fake-xml.xml b/sample-docs/fake-xml.xml
@@ -24,4 +24,10 @@
     <leader>Keith Rowley</leader>
     <sport>Track &amp; Field</sport>
   </country>
+  <country>
+    <name>Trinidad &amp; Tobado</name>
+    <capital>Port of Spain</capital>
+    <leader>Keith Rowley</leader>
+    <sport>Track &amp; Field</sport>
+  </country>
 </factbook>
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
@@ -1,3 +1,4 @@
+import uuid
 from pathlib import Path
 import os
 
@@ -313,6 +314,56 @@ def test_xml_keep_tags_param():
         assert element["text"].replace("&", "&amp;") in response_with_xml_tags["text"]
 
 
+def test_element_ids_by_default_non_unique():
+    """
+    Verify that by default the element_ids aren't unique.
+    """
+    client = TestClient(app)
+    test_file = Path("sample-docs") / "fake-xml.xml"
+    response = client.post(
+        MAIN_API_ROUTE,
+        files=[("files", (str(test_file), open(test_file, "rb")))],
+        data={},
+    )
+    assert response.status_code == 200
+    elements = response.json()
+
+    # Check that the ids are not unique by default, because this xml file has a
+    # duplicated last element.
+    ids = [element["element_id"] for element in elements]
+    # If there are duplicate ids in the ids list, the count of resulting
+    # set will be lower than the count of ids - which is expected here.
+    assert len(ids) != len(set(ids)), "Elements have unique ids"
+
+
+def test_unique_element_ids_param():
+    """
+    Verify that when requested, the element_ids are unique.
+    """
+    client = TestClient(app)
+    test_file = Path("sample-docs") / "fake-xml.xml"
+
+    response = client.post(
+        MAIN_API_ROUTE,
+        files=[("files", (str(test_file), open(test_file, "rb")))],
+        data={
+            "unique_element_ids": "True",
+        },
+    )
+    assert response.status_code == 200
+    elements = response.json()
+
+    ids = [element["element_id"] for element in elements]
+    # If all ids are unique, the count of resulting set
+    # will be same as the count of ids - which is expected here.
+    assert len(ids) == len(set(ids)), "Elements have non-unique ids"
+
+    try:
+        uuid.UUID(ids[0], version=4)
+    except ValueError:
+        raise AssertionError("Element ID is not in UUID format.")
+
+
 def test_include_page_breaks_param():
     """
     Verify that responses do not include page breaks unless requested
@@ -539,6 +590,7 @@ def test_parallel_mode_passes_params(monkeypatch):
             "strategy": "hi_res",
             "xml_keep_tags": "True",
             "skip_infer_table_types": "foo",
+            "unique_element_ids": "True",
             # -- chunking options --
             "chunking_strategy": "by_title",
             "combine_under_n_chars": "501",
@@ -567,6 +619,7 @@ def test_parallel_mode_passes_params(monkeypatch):
         skip_infer_table_types=["foo"],
         extract_image_block_types=None,
         extract_image_block_to_payload=False,
+        unique_element_ids=True,
         # -- chunking options --
         chunking_strategy="by_title",
         combine_text_under_n_chars=501,

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`name: general`
`2`		`-version: 0.0.65`
	`2`	`+version: 0.0.66`