Skip to content

fix: DATA_DIR_KEY #214

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,12 @@ __pycache__/
.idea/
openapi.json
openapi_client.json

# Environments
.env
.envrc
.venv*
venv*
env/
ENV/
env.bak/
23 changes: 14 additions & 9 deletions _test_unstructured_client/integration/test_decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import tempfile
from pathlib import Path
from typing import Literal

import httpx
import json
Expand All @@ -15,6 +16,7 @@
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared, operations
from unstructured_client.models.errors import HTTPValidationError
from unstructured_client.models.shared.partition_parameters import Strategy
from unstructured_client.utils.retries import BackoffStrategy, RetryConfig
from unstructured_client._hooks.custom import form_utils
from unstructured_client._hooks.custom import split_pdf_hook
Expand Down Expand Up @@ -105,19 +107,22 @@ def test_integration_split_pdf_has_same_output_as_non_split(
)
assert len(diff) == 0

@pytest.mark.parametrize( ("filename", "expected_ok", "strategy"), [
("_sample_docs/layout-parser-paper.pdf", True, "hi_res"), # 16
]# pages
)
@pytest.mark.parametrize( ("use_caching", "cache_dir"), [

@pytest.mark.parametrize(("filename", "expected_ok", "strategy"), [
("_sample_docs/layout-parser-paper.pdf", True, shared.Strategy.HI_RES), # 16 pages
])
@pytest.mark.parametrize(("use_caching", "cache_dir"), [
(True, None), # Use default cache dir
(True, Path(tempfile.gettempdir()) / "test_integration_unstructured_client1"), # Use custom cache dir
(False, None), # Don't use caching
(False, Path(tempfile.gettempdir()) / "test_integration_unstructured_client2"), # Don't use caching, use custom cache dir
])
def test_integration_split_pdf_with_caching(
filename: str, expected_ok: bool, strategy: str, use_caching: bool,
cache_dir: Path | None
filename: str,
expected_ok: bool,
strategy: Literal[Strategy.HI_RES],
use_caching: bool,
cache_dir: Path | None,
):
try:
response = requests.get("http://localhost:8000/general/docs")
Expand All @@ -140,10 +145,9 @@ def test_integration_split_pdf_with_caching(
parameters = shared.PartitionParameters(
files=files,
strategy=strategy,
languages=["eng"],
split_pdf_page=True,
split_pdf_cache_tmp_data=use_caching,
split_pdf_cache_dir=cache_dir,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This param doesn't exist, but the test succeeded despite that. Fixed the param anyway.

split_pdf_cache_tmp_data_dir=str(cache_dir),
)

req = operations.PartitionRequest(
Expand Down Expand Up @@ -185,6 +189,7 @@ def test_integration_split_pdf_with_caching(
if cache_dir:
assert not Path(cache_dir).exists()


@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
def test_long_pages_hi_res(filename):
req = operations.PartitionRequest(partition_parameters=shared.PartitionParameters(
Expand Down
36 changes: 32 additions & 4 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
from __future__ import annotations

import asyncio
import io
import logging
from asyncio import Task
from collections import Counter
from functools import partial
from typing import Coroutine
from pathlib import Path
from unittest.mock import MagicMock, patch

import httpx
import pytest
import requests
from requests_toolbelt import MultipartDecoder, MultipartEncoder
from requests_toolbelt import MultipartDecoder

from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
from unstructured_client._hooks.custom.form_utils import (
FormData,
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
PARTITION_FORM_PAGE_RANGE_KEY,
)
from unstructured_client._hooks.custom.split_pdf_hook import (
DEFAULT_CACHE_TMP_DATA_DIR,
DEFAULT_CONCURRENCY_LEVEL,
DEFAULT_STARTING_PAGE_NUMBER,
MAX_CONCURRENCY_LEVEL,
Expand Down Expand Up @@ -434,3 +435,30 @@ async def test_remaining_tasks_cancelled_when_fails_disallowed():
await asyncio.sleep(1)
print("Cancelled amount: ", cancelled_counter["cancelled"])
assert len(tasks) > cancelled_counter["cancelled"] > 0


@patch("unstructured_client._hooks.custom.form_utils.Path")
def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path: MagicMock):
"""Test get_split_pdf_cache_tmp_data_dir uses the directory from the form data."""
# -- Create the form_data
dir_key = form_utils.PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY # -- "split_pdf_cache_tmp_data_dir"
mock_dir = "/mock/dir"
form_data: FormData = {dir_key: mock_dir}

# -- Mock the Path object in form_utils
mock_path_instance = MagicMock()
mock_path.return_value = mock_path_instance
mock_path_instance.exists.return_value = True
mock_path_instance.resolve.return_value = Path(mock_dir)

result = form_utils.get_split_pdf_cache_tmp_data_dir(
form_data = form_data,
key=dir_key,
fallback_value=DEFAULT_CACHE_TMP_DATA_DIR # -- tempfile.gettempdir()
)

assert dir_key == "split_pdf_cache_tmp_data_dir"
assert form_data.get(dir_key) == "/mock/dir"
mock_path.assert_called_once_with(mock_dir)
mock_path_instance.exists.assert_called_once()
assert result == str(Path(mock_dir).resolve())
7 changes: 7 additions & 0 deletions _test_unstructured_client/unit_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import pathlib
from typing import Any
from unittest.mock import (
ANY,
Expand Down Expand Up @@ -31,6 +32,12 @@
"property_mock",
)

def sample_docs_path(file_name: str) -> str:
"""Resolve the absolute-path to `file_name` in the sample-docs directory."""
sample_docs_dir = pathlib.Path(__file__).parent.parent / "_sample_docs"
file_path = sample_docs_dir / file_name
return str(file_path.resolve())


# ------------------------------------------------------------------------------------------------
# MOCKING FIXTURES
Expand Down
13 changes: 7 additions & 6 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
PARTITION_FORM_FILES_KEY,
PARTITION_FORM_PAGE_RANGE_KEY,
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY,
PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
)
from unstructured_client._hooks.types import (
AfterErrorContext,
Expand Down Expand Up @@ -315,7 +317,7 @@ def before_request(

self.cache_tmp_data_dir = form_utils.get_split_pdf_cache_tmp_data_dir(
form_data,
key=PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Main fix

key=PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY,
fallback_value=DEFAULT_CACHE_TMP_DATA_DIR,
)

Expand Down Expand Up @@ -546,7 +548,7 @@ def _get_pdf_chunk_paths(
return pdf_chunk_paths

def _get_pdf_chunk_files(
self, pdf_chunks: list[Tuple[Path, int]]
self, pdf_chunks: list[Tuple[Path, int]]
) -> Generator[Tuple[BinaryIO, int], None, None]:
"""Yields the file objects for the given pdf chunk paths.

Expand All @@ -573,8 +575,7 @@ def _get_pdf_chunk_files(
raise
yield pdf_chunk_file, offset

def _await_elements(
self, operation_id: str) -> Optional[list]:
def _await_elements(self, operation_id: str) -> Optional[list]:
"""
Waits for the partition requests to complete and returns the flattened
elements.
Expand Down
Loading