Skip to content

Commit a0d2113

Browse files
committed
refactoring of folders
1 parent 405f28e commit a0d2113

File tree

8 files changed

+238
-8
lines changed

8 files changed

+238
-8
lines changed

examples/extras/screenshot_scaping.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from scrapegraphai.utils.screenshot_scraping import take_screenshot, select_area_with_opencv, crop_image, detect_text
2+
import asyncio
3+
4+
# STEP 1: Take a screenshot
5+
image = asyncio.run(take_screenshot(
6+
url="https://colab.google/",
7+
save_path="Savedscreenshots/test_image.jpeg",
8+
quality = 50
9+
))
10+
11+
# STEP 2 (Optional): Select the area of the image to use for text detection.
12+
LEFT, TOP, RIGHT, BOTTOM = select_area_with_opencv(image)
13+
print("LEFT: ", LEFT, " TOP: ", TOP, " RIGHT: ", RIGHT, " BOTTOM: ", BOTTOM)
14+
15+
# STEP 3 (Optional): Crop the image.
16+
# Note: Any coordinate (LEFT, TOP, RIGHT, BOTTOM) that is None is set to the corresponding edge of the image.
17+
cropped_image = crop_image(image, LEFT=LEFT, RIGHT=RIGHT,TOP=TOP,BOTTOM=BOTTOM)
18+
19+
# STEP 4: Detect text
20+
text = detect_text(
21+
cropped_image, # The image to detect text from
22+
languages = ["en"] # The languages to detect text in
23+
)
24+
25+
print("DETECTED TEXT: ")
26+
print(text)

pyproject.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@ more-browser-options = [
8888
"browserbase>=0.3.0",
8989
]
9090

91+
# Group 4: Screenshot scraper (Surya OCR, plus matplotlib and ipywidgets)
92+
screenshot_scraper = [
93+
"surya-ocr>=0.4.5",
94+
"matplotlib>=3.7.2",
95+
"ipywidgets>=8.1.0"
96+
]
97+
9198
[build-system]
9299
requires = ["hatchling"]
93100
build-backend = "hatchling.build"

requirements-dev.lock

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ anyio==4.4.0
2929
# via starlette
3030
astroid==3.2.4
3131
# via pylint
32+
asttokens==2.4.1
33+
# via stack-data
3234
async-timeout==4.0.3
3335
# via aiohttp
3436
# via langchain
@@ -64,12 +66,16 @@ click==8.1.7
6466
# via burr
6567
# via streamlit
6668
# via uvicorn
69+
comm==0.2.2
70+
# via ipywidgets
6771
contourpy==1.2.1
6872
# via matplotlib
6973
cycler==0.12.1
7074
# via matplotlib
7175
dataclasses-json==0.6.7
7276
# via langchain-community
77+
decorator==5.1.1
78+
# via ipython
7379
dill==0.3.8
7480
# via multiprocess
7581
# via pylint
@@ -79,7 +85,10 @@ docutils==0.19
7985
# via sphinx
8086
exceptiongroup==1.2.2
8187
# via anyio
88+
# via ipython
8289
# via pytest
90+
executing==2.0.1
91+
# via stack-data
8392
faiss-cpu==1.8.0.post1
8493
# via scrapegraphai
8594
fastapi==0.112.0
@@ -88,6 +97,10 @@ fastapi-pagination==0.12.26
8897
# via burr
8998
filelock==3.15.4
9099
# via huggingface-hub
100+
# via torch
101+
# via transformers
102+
filetype==1.2.0
103+
# via surya-ocr
91104
fonttools==4.53.1
92105
# via matplotlib
93106
free-proxy==1.1.1
@@ -97,6 +110,9 @@ frozenlist==1.4.1
97110
# via aiosignal
98111
fsspec==2024.6.1
99112
# via huggingface-hub
113+
# via torch
114+
ftfy==6.2.3
115+
# via surya-ocr
100116
furo==2024.5.6
101117
# via scrapegraphai
102118
gitdb==4.0.11
@@ -152,6 +168,7 @@ httpx-sse==0.4.0
152168
# via langchain-mistralai
153169
huggingface-hub==0.24.5
154170
# via tokenizers
171+
# via transformers
155172
idna==3.7
156173
# via anyio
157174
# via httpx
@@ -165,13 +182,20 @@ importlib-resources==6.4.0
165182
# via matplotlib
166183
iniconfig==2.0.0
167184
# via pytest
185+
ipython==8.18.1
186+
# via ipywidgets
187+
ipywidgets==8.1.5
188+
# via scrapegraphai
168189
isort==5.13.2
169190
# via pylint
191+
jedi==0.19.1
192+
# via ipython
170193
jinja2==3.1.4
171194
# via altair
172195
# via burr
173196
# via pydeck
174197
# via sphinx
198+
# via torch
175199
jiter==0.5.0
176200
# via openai
177201
jmespath==1.0.1
@@ -185,6 +209,8 @@ jsonschema==4.23.0
185209
# via altair
186210
jsonschema-specifications==2023.12.1
187211
# via jsonschema
212+
jupyterlab-widgets==3.0.13
213+
# via ipywidgets
188214
kiwisolver==1.4.5
189215
# via matplotlib
190216
langchain==0.2.14
@@ -226,6 +252,9 @@ marshmallow==3.21.3
226252
# via dataclasses-json
227253
matplotlib==3.9.1.post1
228254
# via burr
255+
# via scrapegraphai
256+
matplotlib-inline==0.1.7
257+
# via ipython
229258
mccabe==0.7.0
230259
# via pylint
231260
mdurl==0.1.2
@@ -234,6 +263,8 @@ minify-html==0.15.0
234263
# via scrapegraphai
235264
mpire==2.10.2
236265
# via semchunk
266+
mpmath==1.3.0
267+
# via sympy
237268
multidict==6.0.5
238269
# via aiohttp
239270
# via yarl
@@ -243,21 +274,27 @@ mypy-extensions==1.0.0
243274
# via typing-inspect
244275
narwhals==1.3.0
245276
# via altair
277+
networkx==3.2.1
278+
# via torch
246279
numpy==1.26.4
247280
# via contourpy
248281
# via faiss-cpu
249282
# via langchain
250283
# via langchain-aws
251284
# via langchain-community
252285
# via matplotlib
286+
# via opencv-python
253287
# via pandas
254288
# via pyarrow
255289
# via pydeck
256290
# via sf-hamilton
257291
# via streamlit
292+
# via transformers
258293
openai==1.40.3
259294
# via burr
260295
# via langchain-openai
296+
opencv-python==4.10.0.84
297+
# via surya-ocr
261298
orjson==3.10.7
262299
# via langsmith
263300
packaging==24.1
@@ -270,20 +307,28 @@ packaging==24.1
270307
# via pytest
271308
# via sphinx
272309
# via streamlit
310+
# via transformers
273311
pandas==2.2.2
274312
# via scrapegraphai
275313
# via sf-hamilton
276314
# via streamlit
315+
parso==0.8.4
316+
# via jedi
317+
pexpect==4.9.0
318+
# via ipython
277319
pillow==10.4.0
278320
# via matplotlib
279321
# via streamlit
322+
# via surya-ocr
280323
platformdirs==4.2.2
281324
# via pylint
282325
playwright==1.45.1
283326
# via scrapegraphai
284327
# via undetected-playwright
285328
pluggy==1.5.0
286329
# via pytest
330+
prompt-toolkit==3.0.47
331+
# via ipython
287332
proto-plus==1.24.0
288333
# via google-ai-generativelanguage
289334
# via google-api-core
@@ -295,6 +340,10 @@ protobuf==4.25.4
295340
# via grpcio-status
296341
# via proto-plus
297342
# via streamlit
343+
ptyprocess==0.7.0
344+
# via pexpect
345+
pure-eval==0.2.3
346+
# via stack-data
298347
pyarrow==17.0.0
299348
# via streamlit
300349
pyasn1==0.6.0
@@ -311,21 +360,28 @@ pydantic==2.8.2
311360
# via langchain-core
312361
# via langsmith
313362
# via openai
363+
# via pydantic-settings
364+
# via surya-ocr
314365
pydantic-core==2.20.1
315366
# via pydantic
367+
pydantic-settings==2.4.0
368+
# via surya-ocr
316369
pydeck==0.9.1
317370
# via streamlit
318371
pyee==11.1.0
319372
# via playwright
320373
pygments==2.18.0
321374
# via furo
375+
# via ipython
322376
# via mpire
323377
# via rich
324378
# via sphinx
325379
pylint==3.2.6
326380
pyparsing==3.1.2
327381
# via httplib2
328382
# via matplotlib
383+
pypdfium2==4.30.0
384+
# via surya-ocr
329385
pytest==8.0.0
330386
# via pytest-mock
331387
pytest-mock==3.14.0
@@ -334,19 +390,23 @@ python-dateutil==2.9.0.post0
334390
# via matplotlib
335391
# via pandas
336392
python-dotenv==1.0.1
393+
# via pydantic-settings
337394
# via scrapegraphai
395+
# via surya-ocr
338396
pytz==2024.1
339397
# via pandas
340398
pyyaml==6.0.2
341399
# via huggingface-hub
342400
# via langchain
343401
# via langchain-community
344402
# via langchain-core
403+
# via transformers
345404
referencing==0.35.1
346405
# via jsonschema
347406
# via jsonschema-specifications
348407
regex==2024.7.24
349408
# via tiktoken
409+
# via transformers
350410
requests==2.32.3
351411
# via burr
352412
# via free-proxy
@@ -358,6 +418,7 @@ requests==2.32.3
358418
# via sphinx
359419
# via streamlit
360420
# via tiktoken
421+
# via transformers
361422
rich==13.7.1
362423
# via streamlit
363424
rpds-py==0.20.0
@@ -367,11 +428,14 @@ rsa==4.9
367428
# via google-auth
368429
s3transfer==0.10.2
369430
# via boto3
431+
safetensors==0.4.4
432+
# via transformers
370433
semchunk==2.2.0
371434
# via scrapegraphai
372435
sf-hamilton==1.73.1
373436
# via burr
374437
six==1.16.0
438+
# via asttokens
375439
# via python-dateutil
376440
smmap==5.0.1
377441
# via gitdb
@@ -404,10 +468,18 @@ sphinxcontrib-serializinghtml==2.0.0
404468
sqlalchemy==2.0.32
405469
# via langchain
406470
# via langchain-community
471+
stack-data==0.6.3
472+
# via ipython
407473
starlette==0.37.2
408474
# via fastapi
409475
streamlit==1.37.1
410476
# via burr
477+
surya-ocr==0.5.0
478+
# via scrapegraphai
479+
sympy==1.13.2
480+
# via torch
481+
tabulate==0.9.0
482+
# via surya-ocr
411483
tenacity==8.5.0
412484
# via langchain
413485
# via langchain-community
@@ -418,13 +490,16 @@ tiktoken==0.7.0
418490
# via scrapegraphai
419491
tokenizers==0.19.1
420492
# via langchain-mistralai
493+
# via transformers
421494
toml==0.10.2
422495
# via streamlit
423496
tomli==2.0.1
424497
# via pylint
425498
# via pytest
426499
tomlkit==0.13.0
427500
# via pylint
501+
torch==2.4.0
502+
# via surya-ocr
428503
tornado==6.4.1
429504
# via streamlit
430505
tqdm==4.66.5
@@ -434,6 +509,14 @@ tqdm==4.66.5
434509
# via openai
435510
# via scrapegraphai
436511
# via semchunk
512+
# via transformers
513+
traitlets==5.14.3
514+
# via comm
515+
# via ipython
516+
# via ipywidgets
517+
# via matplotlib-inline
518+
transformers==4.44.2
519+
# via surya-ocr
437520
typing-extensions==4.12.2
438521
# via altair
439522
# via anyio
@@ -442,6 +525,7 @@ typing-extensions==4.12.2
442525
# via fastapi-pagination
443526
# via google-generativeai
444527
# via huggingface-hub
528+
# via ipython
445529
# via langchain-core
446530
# via openai
447531
# via pydantic
@@ -452,6 +536,7 @@ typing-extensions==4.12.2
452536
# via sqlalchemy
453537
# via starlette
454538
# via streamlit
539+
# via torch
455540
# via typing-inspect
456541
# via uvicorn
457542
typing-inspect==0.9.0
@@ -468,6 +553,11 @@ urllib3==1.26.19
468553
# via requests
469554
uvicorn==0.30.5
470555
# via burr
556+
wcwidth==0.2.13
557+
# via ftfy
558+
# via prompt-toolkit
559+
widgetsnbextension==4.0.13
560+
# via ipywidgets
471561
yarl==1.9.4
472562
# via aiohttp
473563
zipp==3.20.0

0 commit comments

Comments
 (0)