Skip to content

Move Split PDF page functionality to hooks #69

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b78ff6f
requests-toolbelt dependency
mpolomdeepsense Apr 22, 2024
5dd920e
Move hooks to separate subdirectory and refactor hooks registration
mpolomdeepsense Apr 22, 2024
74ffbd7
Split pdf hook
mpolomdeepsense Apr 22, 2024
0207dfd
Remove previous implementation of split pdf page functionality
mpolomdeepsense Apr 22, 2024
b4829f0
Add missing new line at the end of a file
mpolomdeepsense Apr 22, 2024
daf7f5e
Fix _clear_operation function indentation
mpolomdeepsense Apr 22, 2024
6ca1451
Speakeasy generate client output
mpolomdeepsense Apr 22, 2024
a73fbe3
Remove TypeAlias import; fixes python 3.9 build
mpolomdeepsense Apr 22, 2024
d15518b
Support default type annotations in older python versions
mpolomdeepsense Apr 22, 2024
8c7ced7
Fix logging initialization
mpolomdeepsense Apr 22, 2024
c038dfd
Fix parallel api calls
mpolomdeepsense Apr 22, 2024
e3ce00c
Added logging in case of an error
mpolomdeepsense Apr 22, 2024
a182d19
Replace relative imports
mpolomdeepsense Apr 23, 2024
846941f
Reason to removing file extension from file name
mpolomdeepsense Apr 23, 2024
9d5d532
Code cleanup
mpolomdeepsense Apr 23, 2024
2e438e8
Explained why last page is skipped when sending parallel requests
mpolomdeepsense Apr 23, 2024
25f1a96
split pdf hook tests
mpolomdeepsense Apr 23, 2024
9042e99
Improve pdf file validation before splitting
mpolomdeepsense Apr 24, 2024
2b850fb
Rollback to port 8000
mpolomdeepsense Apr 24, 2024
b017cf4
Use Speakeasy Files instead of a custom File class
mpolomdeepsense Apr 24, 2024
7263f0e
Unit tests for _is_pdf method of split_pdf_hook
mpolomdeepsense Apr 24, 2024
a2ba3ae
Changed after error hooks order. We don't want to log error in case l…
mpolomdeepsense Apr 24, 2024
9f920dc
Fix for when split_pdf_page is True and one paged pdf is sent. Docume…
mpolomdeepsense Apr 24, 2024
24eae11
Test comment for more code understanding
mpolomdeepsense Apr 24, 2024
6ddac3e
Handling an edge case for when a filename would be an empty string
mpolomdeepsense Apr 24, 2024
4610465
Made empty filename edge case more robust
mpolomdeepsense Apr 24, 2024
1db74d3
Handle edge case for when user inputs empty filename
mpolomdeepsense Apr 25, 2024
1f14e24
Split pdf hook unit tests description
mpolomdeepsense Apr 25, 2024
edbed11
Replace relative imports
mpolomdeepsense Apr 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .speakeasy/gen.lock
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
lockVersion: 2.0.0
id: 8b5fa338-9106-4734-abf0-e30d67044a90
management:
docChecksum: 33cfd4e27a32bf67fdb56996b6bd7a7a
docChecksum: b35264eb5f2ce89c808012333367cf1c
docVersion: 0.0.1
speakeasyVersion: 1.241.0
generationVersion: 2.300.0
releaseVersion: 0.23.1
configChecksum: fabd3c02a49e5b9a6a10fa261a46c3c9
speakeasyVersion: 1.267.1
generationVersion: 2.312.1
releaseVersion: 0.23.2
configChecksum: c0ddfa44eb8fbd51d397d36253d1d68f
repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
repoSubDirectory: .
installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git
published: true
features:
python:
core: 4.6.3
core: 4.6.5
examples: 2.81.3
globalSecurity: 2.83.5
globalServerURLs: 2.82.2
nameOverrides: 2.81.2
responseFormat: 0.1.0
retries: 2.82.1
retries: 2.82.2
serverIDs: 2.81.1
unions: 2.82.6
generatedFiles:
Expand Down
67 changes: 51 additions & 16 deletions _test_unstructured_client/test__decorators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import pypdf
import pytest
import requests
from deepdiff import DeepDiff
Expand All @@ -15,18 +14,18 @@
@pytest.mark.parametrize(
"filename, expected_ok",
[
("_sample_docs/list-item-example-1.pdf", True), # 1 page
("_sample_docs/list-item-example-1.pdf", True), # 1 page
("_sample_docs/layout-parser-paper-fast.pdf", True), # 2 pages
("_sample_docs/layout-parser-paper.pdf", True), # 16 pages
("_sample_docs/layout-parser-paper.pdf", True), # 16 pages
("_sample_docs/fake.doc", True),
("_sample_docs/fake.doc", False), # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
(
"_sample_docs/fake.doc",
False,
), # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
],
)
def test_integration_split_pdf_has_same_output_as_non_split(
call_threads: int,
filename: str,
expected_ok: bool,
caplog
call_threads: int, filename: str, expected_ok: bool, caplog
):
"""
Tests that output that we get from the split-by-page pdf is the same as from non-split.
Expand All @@ -36,14 +35,13 @@ def test_integration_split_pdf_has_same_output_as_non_split(
"""
try:
response = requests.get("http://localhost:8000/general/docs")
assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
assert (
response.status_code == 200
), "The unstructured-api is not running on localhost:8000"
except requests.exceptions.ConnectionError:
assert False, "The unstructured-api is not running on localhost:8000"

client = UnstructuredClient(
api_key_auth=FAKE_KEY,
server_url="localhost:8000"
)
client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

with open(filename, "rb") as f:
files = shared.Files(
Expand All @@ -56,7 +54,7 @@ def test_integration_split_pdf_has_same_output_as_non_split(

req = shared.PartitionParameters(
files=files,
strategy='fast',
strategy="fast",
languages=["eng"],
split_pdf_page=True,
)
Expand All @@ -81,6 +79,43 @@ def test_integration_split_pdf_has_same_output_as_non_split(
assert resp_split.status_code == resp_single.status_code

# Difference in the parent_id is expected, because parent_ids are assigned when element crosses page boundary
diff = DeepDiff(t1=resp_split.elements, t2=resp_single.elements,
exclude_regex_paths=r"root\[\d+\]\['metadata'\]\['parent_id'\]")
diff = DeepDiff(
t1=resp_split.elements,
t2=resp_single.elements,
exclude_regex_paths=[
r"root\[\d+\]\['metadata'\]\['parent_id'\]",
# TODO: (Marek Połom) - Remove page number pattern after page numbering parameter is added
r"root\[\d+\]\['metadata'\]\['page_number'\]",
],
)
assert len(diff) == 0


def test_integration_split_pdf_for_file_with_no_name():
    """
    Check that a page-split partition request with a blank file name is rejected.

    The uploaded content is a real PDF, but ``file_name`` is a single space.
    With ``split_pdf_page=True`` the call to ``client.general.partition`` is
    expected to raise ``ValueError``.

    This is an integration test: it first probes the docs endpoint on
    localhost:8000 and fails immediately if the API is not reachable there.
    """
    # Health check: fail fast with a clear message when the API is not up.
    try:
        docs_response = requests.get("http://localhost:8000/general/docs")
    except requests.exceptions.ConnectionError:
        assert False, "The unstructured-api is not running on localhost:8000"
    else:
        assert (
            docs_response.status_code == 200
        ), "The unstructured-api is not running on localhost:8000"

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    with open("_sample_docs/layout-parser-paper-fast.pdf", "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Valid PDF bytes paired with a whitespace-only file name.
    blank_name_files = shared.Files(
        content=pdf_bytes,
        file_name=" ",
    )

    req = shared.PartitionParameters(
        files=blank_name_files,
        strategy="fast",
        languages=["eng"],
        split_pdf_page=True,
    )

    with pytest.raises(ValueError):
        client.general.partition(req)
Loading