Skip to content

Commit 7f4189a

Browse files
committed
Document Windows Executable Compilation
1 parent 8bdb1de commit 7f4189a

File tree

3 files changed

+691
-0
lines changed

3 files changed

+691
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,6 @@ celerybeat-schedule.db
145145

146146
# temporarily generated files by project-specific Makefile
147147
tmp*
148+
149+
# pyinstaller
150+
nltk_data

docs/Windows.md

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
# Get Started
2+
3+
## Verified File Types
4+
1. pptx/ppt
5+
2. docx/doc
6+
3. xlsx/xls
7+
4. xml
8+
9+
## Setup Python
10+
1. Install Anaconda or install Python
11+
12+
2. Install LibreOffice
13+
* Install the full LibreOffice
14+
* Add the program to PATH. `Press "Windows" Key > Edit the system environment variables > Environment Variables > System variables > PATH > New > C:\Program Files\LibreOffice\program`
15+
16+
3. Set up Python dependencies
17+
```
18+
cd unstructured-api
19+
python -m pip install -r requirements/base.txt
20+
python -c "import nltk; nltk.download('punkt')"
21+
python -c "import nltk; nltk.download('averaged_perceptron_tagger')"
22+
```
23+
24+
4. Launch the app
25+
```
26+
python -m uvicorn prepline_general.api.app:app --reload --log-config logger_config.yaml
27+
```
28+
29+
## Compile Executable
30+
31+
### Windows
32+
33+
1. Install LibreOffice
34+
- Install the full LibreOffice
35+
- Add the program to PATH. `Press "Windows" Key > Edit the system environment variables > Environment Variables > System variables > PATH > New > C:\Program Files\LibreOffice\program`
36+
2. Create a python virtual environment
37+
3. Run the following commands.
38+
```powershell
39+
cd <path/to/repo/>
40+
python -m pip install -r requirements/win-base.txt  # uvloop does not support Windows
41+
python -c "import nltk; nltk.download('punkt', download_dir='nltk_data')"
42+
python -c "import nltk; nltk.download('averaged_perceptron_tagger', download_dir='nltk_data')"
43+
pip install httpx htmlBuilder
44+
```
45+
4. Download `sqlite3.dll` as a patch, because Python's native `sqlite3` somehow does not work. Get [sqlite-dll-win-x64-3460000.zip](https://www.sqlite.org/2024/sqlite-dll-win-x64-3460000.zip) from the [SQLite download page](https://www.sqlite.org/download.html), place it at the root of the repository, and unzip it there so that `sqlite-dll-win-x64-3460000/sqlite3.dll` exists (the spec file below references that path).
46+
5. Convert the unstructuredioapi repo into a python package by updating the `pyproject.toml`
47+
```toml
48+
[build-system]
49+
# setuptools-scm considers all files tracked by git to be data files
50+
requires = ["setuptools>=62.0", "setuptools-scm"]
51+
build-backend = "setuptools.build_meta"
52+
53+
[project]
54+
name = "prepline_general"
55+
description = "UnstructuredIO API"
56+
readme = "README.md"
57+
requires-python = "~=3.10"
58+
# keywords = ["one", "two"]
59+
license = { text = "Proprietary" }
60+
classifiers = [ # https://pypi.org/classifiers/
61+
"Development Status :: 3 - Alpha",
62+
"Programming Language :: Python :: 3 :: Only",
63+
"Intended Audience :: Information Technology",
64+
"Operating System :: Unix",
65+
]
66+
version="0.0.68"
67+
68+
[tool.black]
69+
line-length = 100
70+
71+
[tool.pyright]
72+
pythonPlatform = "Linux"
73+
pythonVersion = "3.9"
74+
reportUnnecessaryCast = true
75+
typeCheckingMode = "strict"
76+
77+
[tool.ruff]
78+
line-length = 100
79+
select = [
80+
"C4", # -- flake8-comprehensions --
81+
"COM", # -- flake8-commas --
82+
"E", # -- pycodestyle errors --
83+
"F", # -- pyflakes --
84+
"I", # -- isort (imports) --
85+
"PLR0402", # -- Name compared with itself like `foo == foo` --
86+
"PT", # -- flake8-pytest-style --
87+
"SIM", # -- flake8-simplify --
88+
"UP015", # -- redundant `open()` mode parameter (like "r" is default) --
89+
"UP018", # -- Unnecessary {literal_type} call like `str("abc")`. (rewrite as a literal) --
90+
"UP032", # -- Use f-string instead of `.format()` call --
91+
"UP034", # -- Avoid extraneous parentheses --
92+
]
93+
ignore = [
94+
"COM812", # -- over aggressively insists on trailing commas where not desireable --
95+
"PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception --
96+
"PT012", # -- pytest.raises() block should contain a single simple statement --
97+
"SIM117", # -- merge `with` statements for context managers that have same scope --
98+
]
99+
100+
[tool.ruff.lint.isort]
101+
known-first-party = [
102+
"unstructured",
103+
"unstructured_inference",
104+
]
105+
106+
[tool.setuptools.packages.find]
107+
where = ["."]
108+
```
109+
6. Install the repo as a package. `python -m pip install -e . `
110+
7. Create a pyinstaller spec file called `unstructuredio_api.spec` with the following content. Modify the variables based on the comments found in the spec file.
111+
```python
112+
# -*- mode: python ; coding: utf-8 -*-
113+
114+
import os
115+
from pathlib import Path
116+
import sys
117+
from PyInstaller.utils.hooks import collect_all
118+
119+
binaries_list = [
120+
('C:\\Program Files\\LibreOffice\\program', 'libreoffice'), # modify this to point to where the LibreOffice is installed
121+
(Path('sqlite-dll-win-x64-3460000/sqlite3.dll').as_posix(), '.'), # modify this to point to where you unzip the sqlite3.dll
122+
(Path('sqlite-dll-win-x64-3460000/sqlite3.def').as_posix(), '.'), # modify this to point to where you unzip the sqlite3.def
123+
124+
]
125+
126+
datas_list = [
127+
(Path('logger_config.yaml').as_posix(), 'config'), # modify this to point to where the repo is
128+
(Path('nltk_data').as_posix(), 'nltk_data') # modify this to point to where nltk download the nltk data
129+
]
130+
131+
hiddenimports_list = []
132+
133+
def add_package(package_name):
134+
datas, binaries, hiddenimports = collect_all(package_name)
135+
datas_list.extend(datas)
136+
binaries_list.extend(binaries)
137+
hiddenimports_list.extend(hiddenimports)
138+
139+
# Collect all resources from the package_name
140+
add_package('unstructured')
141+
add_package('effdet')
142+
add_package('onnxruntime')
143+
add_package('encodings')
144+
add_package('prepline_general')
145+
146+
a = Analysis(
147+
[Path('unstructuredio_api.py').as_posix()],
148+
pathex=[],
149+
binaries=binaries_list,
150+
datas=datas_list,
151+
hiddenimports=hiddenimports_list,
152+
hookspath=[],
153+
hooksconfig={},
154+
runtime_hooks=[],
155+
excludes=[],
156+
noarchive=False,
157+
optimize=0,
158+
)
159+
pyz = PYZ(a.pure)
160+
161+
exe = EXE(
162+
pyz,
163+
a.scripts,
164+
[],
165+
exclude_binaries=True,
166+
name='unstructuredio_api',
167+
debug=False,
168+
bootloader_ignore_signals=False,
169+
strip=False,
170+
upx=True,
171+
console=True,
172+
disable_windowed_traceback=False,
173+
argv_emulation=False,
174+
target_arch=None,
175+
codesign_identity=None,
176+
entitlements_file=None,
177+
)
178+
coll = COLLECT(
179+
exe,
180+
a.binaries,
181+
a.datas,
182+
strip=False,
183+
upx=True,
184+
upx_exclude=[],
185+
name='unstructuredio_api',
186+
)
187+
```
188+
8. Create an `unstructuredio_api.py` file
189+
```python
190+
import uvicorn
191+
import os
192+
193+
if __name__ == "__main__":
194+
uvicorn.run(
195+
"prepline_general.api.app:app",
196+
port=6989,
197+
host="0.0.0.0",
198+
log_config=os.path.join("_internal", "config", "logger_config.yaml")
199+
)
200+
```
201+
9. Replace relative imports with absolute imports.
202+
- `prepline_general/api/app.py`
203+
```python
204+
...
205+
206+
#from .general import router as general_router
207+
#from .openapi import set_custom_openapi
208+
209+
from prepline_general.api.general import router as general_router
210+
from prepline_general.api.openapi import set_custom_openapi
211+
...
212+
```
213+
10. Install pyinstaller. `pip install pyinstaller`
214+
11. Start packaging. `pyinstaller .\unstructuredio_api.spec`

0 commit comments

Comments
 (0)