|
| 1 | +# Get Started |
| 2 | + |
| 3 | +## Verified File Types |
| 4 | +1. pptx/ppt |
| 5 | +2. docx/doc |
| 6 | +3. xlsx/xls |
| 7 | +4. xml |
| 8 | + |
| 9 | +## Setup Python |
| 10 | +1. Install Anaconda or Python |
| 11 | + |
| 12 | +2. Install LibreOffice |
| 13 | + * Install the full LibreOffice |
| 14 | + * Add the program to PATH. `Press "Windows" Key > Edit the system environment variables > Environment Variables > System variables > PATH > New > C:\Program Files\LibreOffice\program` |
| 15 | + |
| 16 | +3. Set up Python dependencies |
| 17 | + ``` |
| 18 | + cd unstructured-api |
| 19 | + python -m pip install -r requirements/base.txt |
| 20 | + python -c "import nltk; nltk.download('punkt')" |
| 21 | + python -c "import nltk; nltk.download('averaged_perceptron_tagger')" |
| 22 | + ``` |
| 23 | +
|
| 24 | +4. Launch the app |
| 25 | + ``` |
| 26 | + python -m uvicorn prepline_general.api.app:app --reload --log-config logger_config.yaml |
| 27 | + ``` |
| 28 | +
|
| 29 | +## Compile Executable |
| 30 | +
|
| 31 | +### Windows |
| 32 | +
|
| 33 | +1. Install LibreOffice |
| 34 | + - Install the full LibreOffice |
| 35 | + - Add the program to PATH. `Press "Windows" Key > Edit the system environment variables > Environment Variables > System variables > PATH > New > C:\Program Files\LibreOffice\program` |
| 36 | +2. Create a python virtual environment |
| 37 | +3. Run the following commands. |
| 38 | + ```powershell |
| 39 | + cd <path/to/repo/> |
| 40 | + python -m pip install -r requirements/win-base.txt  # uvloop does not support Windows |
| 41 | + python -c "import nltk; nltk.download('punkt', download_dir='nltk_data')" |
| 42 | + python -c "import nltk; nltk.download('averaged_perceptron_tagger', download_dir='nltk_data')" |
| 43 | + pip install httpx htmlBuilder |
| 44 | + ``` |
| 45 | +4. Download `sqlite3.dll` as a patch, because Python's native `sqlite3` somehow does not work. From the [SQLite download page](https://www.sqlite.org/download.html), download [sqlite-dll-win-x64-3460000.zip](https://www.sqlite.org/2024/sqlite-dll-win-x64-3460000.zip), place it at the root of the repository, and unzip it there. |
| 46 | +5. Convert the unstructuredioapi repo into a python package by updating the `pyproject.toml` |
| 47 | + ```pyproject.toml |
| 48 | + [build-system] |
| 49 | + # setuptools-scm considers all files tracked by git to be data files |
| 50 | + requires = ["setuptools>=62.0", "setuptools-scm"] |
| 51 | + build-backend = "setuptools.build_meta" |
| 52 | + |
| 53 | + [project] |
| 54 | + name = "prepline_general" |
| 55 | + description = "UnstructuredIO API" |
| 56 | + readme = "README.md" |
| 57 | + requires-python = "~=3.10" |
| 58 | + # keywords = ["one", "two"] |
| 59 | + license = { text = "Proprietary" } |
| 60 | + classifiers = [ # https://pypi.org/classifiers/ |
| 61 | + "Development Status :: 3 - Alpha", |
| 62 | + "Programming Language :: Python :: 3 :: Only", |
| 63 | + "Intended Audience :: Information Technology", |
| 64 | + "Operating System :: Unix", |
| 65 | + ] |
| 66 | + version="0.0.68" |
| 67 | + |
| 68 | + [tool.black] |
| 69 | + line-length = 100 |
| 70 | + |
| 71 | + [tool.pyright] |
| 72 | + pythonPlatform = "Linux" |
| 73 | + pythonVersion = "3.9" |
| 74 | + reportUnnecessaryCast = true |
| 75 | + typeCheckingMode = "strict" |
| 76 | + |
| 77 | + [tool.ruff] |
| 78 | + line-length = 100 |
| 79 | + select = [ |
| 80 | + "C4", # -- flake8-comprehensions -- |
| 81 | + "COM", # -- flake8-commas -- |
| 82 | + "E", # -- pycodestyle errors -- |
| 83 | + "F", # -- pyflakes -- |
| 84 | + "I", # -- isort (imports) -- |
| 85 | + "PLR0402", # -- Name compared with itself like `foo == foo` -- |
| 86 | + "PT", # -- flake8-pytest-style -- |
| 87 | + "SIM", # -- flake8-simplify -- |
| 88 | + "UP015", # -- redundant `open()` mode parameter (like "r" is default) -- |
| 89 | + "UP018", # -- Unnecessary {literal_type} call like `str("abc")`. (rewrite as a literal) -- |
| 90 | + "UP032", # -- Use f-string instead of `.format()` call -- |
| 91 | + "UP034", # -- Avoid extraneous parentheses -- |
| 92 | + ] |
| 93 | + ignore = [ |
| 94 | + "COM812", # -- over aggressively insists on trailing commas where not desireable -- |
| 95 | + "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- |
| 96 | + "PT012", # -- pytest.raises() block should contain a single simple statement -- |
| 97 | + "SIM117", # -- merge `with` statements for context managers that have same scope -- |
| 98 | + ] |
| 99 | + |
| 100 | + [tool.ruff.lint.isort] |
| 101 | + known-first-party = [ |
| 102 | + "unstructured", |
| 103 | + "unstructured_inference", |
| 104 | + ] |
| 105 | + |
| 106 | + [tool.setuptools.packages.find] |
| 107 | + where = ["."] |
| 108 | + ``` |
| 109 | +6. Install the repo as a package. `python -m pip install -e . ` |
| 110 | +7. Create a PyInstaller spec file called `unstructuredio_api.spec` with the following content. Modify the variables based on the comments found in the spec file. |
| 111 | + ```unstructuredio_api.spec |
| 112 | + # -*- mode: python ; coding: utf-8 -*- |
| 113 | +
|
| 114 | + import os |
| 115 | + from pathlib import Path |
| 116 | + import sys |
| 117 | + from PyInstaller.utils.hooks import collect_all |
| 118 | +
|
| 119 | + binaries_list = [ |
| 120 | + ('C:\\Program Files\\LibreOffice\\program', 'libreoffice'), # modify this to point to where the LibreOffice is installed |
| 121 | + (Path('sqlite-dll-win-x64-3460000/sqlite3.dll').as_posix(), '.'), # modify this to point to where you unzip the sqlite3.dll |
| 122 | + (Path('sqlite-dll-win-x64-3460000/sqlite3.def').as_posix(), '.'), # modify this to point to where you unzip the sqlite3.def |
| 123 | +
|
| 124 | + ] |
| 125 | +
|
| 126 | + datas_list = [ |
| 127 | + (Path('logger_config.yaml').as_posix(), 'config'), # modify this to point to where the repo is |
| 128 | + (Path('nltk_data').as_posix(), 'nltk_data') # modify this to point to where nltk download the nltk data |
| 129 | + ] |
| 130 | +
|
| 131 | + hiddenimports_list = [] |
| 132 | +
|
| 133 | + def add_package(package_name): |
| 134 | + datas, binaries, hiddenimports = collect_all(package_name) |
| 135 | + datas_list.extend(datas) |
| 136 | + binaries_list.extend(binaries) |
| 137 | + hiddenimports_list.extend(hiddenimports) |
| 138 | +
|
| 139 | + # Collect all resources from the package_name |
| 140 | + add_package('unstructured') |
| 141 | + add_package('effdet') |
| 142 | + add_package('onnxruntime') |
| 143 | + add_package('encodings') |
| 144 | + add_package('prepline_general') |
| 145 | +
|
| 146 | + a = Analysis( |
| 147 | + [Path('unstructuredio_api.py').as_posix()], |
| 148 | + pathex=[], |
| 149 | + binaries=binaries_list, |
| 150 | + datas=datas_list, |
| 151 | + hiddenimports=hiddenimports_list, |
| 152 | + hookspath=[], |
| 153 | + hooksconfig={}, |
| 154 | + runtime_hooks=[], |
| 155 | + excludes=[], |
| 156 | + noarchive=False, |
| 157 | + optimize=0, |
| 158 | + ) |
| 159 | + pyz = PYZ(a.pure) |
| 160 | +
|
| 161 | + exe = EXE( |
| 162 | + pyz, |
| 163 | + a.scripts, |
| 164 | + [], |
| 165 | + exclude_binaries=True, |
| 166 | + name='unstructuredio_api', |
| 167 | + debug=False, |
| 168 | + bootloader_ignore_signals=False, |
| 169 | + strip=False, |
| 170 | + upx=True, |
| 171 | + console=True, |
| 172 | + disable_windowed_traceback=False, |
| 173 | + argv_emulation=False, |
| 174 | + target_arch=None, |
| 175 | + codesign_identity=None, |
| 176 | + entitlements_file=None, |
| 177 | + ) |
| 178 | + coll = COLLECT( |
| 179 | + exe, |
| 180 | + a.binaries, |
| 181 | + a.datas, |
| 182 | + strip=False, |
| 183 | + upx=True, |
| 184 | + upx_exclude=[], |
| 185 | + name='unstructuredio_api', |
| 186 | + ) |
| 187 | + ``` |
| 188 | +8. Create a `unstructuredio_api.py` file |
| 189 | + ```python |
| 190 | + import uvicorn |
| 191 | + import os |
| 192 | + |
| 193 | + if __name__ == "__main__": |
| 194 | + uvicorn.run( |
| 195 | + "prepline_general.api.app:app", |
| 196 | + port=6989, |
| 197 | + host="0.0.0.0", |
| 198 | + log_config=os.path.join("_internal", "config", "logger_config.yaml") |
| 199 | + ) |
| 200 | + ``` |
| 201 | +9. Replace relative imports with absolute imports. |
| 202 | + - `prepline_general/api/app.py` |
| 203 | + ```python |
| 204 | + ... |
| 205 | +
|
| 206 | + #from .general import router as general_router |
| 207 | + #from .openapi import set_custom_openapi |
| 208 | +
|
| 209 | + from prepline_general.api.general import router as general_router |
| 210 | + from prepline_general.api.openapi import set_custom_openapi |
| 211 | + ... |
| 212 | + ``` |
| 213 | +10. Install PyInstaller. `pip install pyinstaller` |
| 214 | +11. Start packaging. `pyinstaller .\unstructuredio_api.spec` |
0 commit comments