Skip to content

Commit 388630c

Browse files
committed
fix: screenshot scraper
1 parent a0d2113 commit 388630c

File tree

9 files changed

+46
-39
lines changed

9 files changed

+46
-39
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ This group includes additional browser management options, such as BrowserBase.
5454
pip install scrapegraphai[more-browser-options]
5555
```
5656

57+
### Installing "Screenshot Scraper"
58+
59+
This group includes an OCR-based scraper that extracts text from website screenshots.
60+
```bash
61+
pip install scrapegraphai[screenshot_scraper]
62+
```
63+
5764
## 💻 Usage
5865
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
5966

Loading

examples/extras/screenshot_scaping.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
1-
from scrapegraphai.utils.screenshot_scraping import take_screenshot, select_area_with_opencv, crop_image, detect_text
1+
"""
2+
example of scraping with screenshots
3+
"""
24
import asyncio
5+
from scrapegraphai.utils.screenshot_scraping import (take_screenshot,
6+
select_area_with_opencv,
7+
crop_image, detect_text)
38

49
# STEP 1: Take a screenshot
510
image = asyncio.run(take_screenshot(
@@ -13,14 +18,15 @@
1318
print("LEFT: ", LEFT, " TOP: ", TOP, " RIGHT: ", RIGHT, " BOTTOM: ", BOTTOM)
1419

1520
# STEP 3 (Optional): Crop the image.
16-
# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image.
21+
# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None,
22+
# it will be set to the corresponding edge of the image.
1723
cropped_image = crop_image(image, LEFT=LEFT, RIGHT=RIGHT,TOP=TOP,BOTTOM=BOTTOM)
1824

1925
# STEP 4: Detect text
20-
text = detect_text(
26+
TEXT = detect_text(
2127
cropped_image, # The image to detect text from
2228
languages = ["en"] # The languages to detect text in
2329
)
2430

2531
print("DETECTED TEXT: ")
26-
print(text)
32+
print(TEXT)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ more-browser-options = [
9090

9191
# Group 4: Surya Library
9292
screenshot_scraper = [
93-
"surya-ocr>=0.4.5",
93+
"surya-ocr>=0.5.0",
9494
"matplotlib>=3.7.2",
9595
"ipywidgets>=8.1.0"
9696
]

requirements-dev.lock

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,6 @@ idna==3.7
176176
# via yarl
177177
imagesize==1.4.1
178178
# via sphinx
179-
importlib-metadata==8.2.0
180-
# via sphinx
181-
importlib-resources==6.4.0
182-
# via matplotlib
183179
iniconfig==2.0.0
184180
# via pytest
185181
ipython==8.18.1
@@ -525,16 +521,13 @@ typing-extensions==4.12.2
525521
# via fastapi-pagination
526522
# via google-generativeai
527523
# via huggingface-hub
528-
# via ipython
529524
# via langchain-core
530525
# via openai
531526
# via pydantic
532527
# via pydantic-core
533528
# via pyee
534-
# via pylint
535529
# via sf-hamilton
536530
# via sqlalchemy
537-
# via starlette
538531
# via streamlit
539532
# via torch
540533
# via typing-inspect
@@ -560,6 +553,3 @@ widgetsnbextension==4.0.13
560553
# via ipywidgets
561554
yarl==1.9.4
562555
# via aiohttp
563-
zipp==3.20.0
564-
# via importlib-metadata
565-
# via importlib-resources

requirements.lock

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,6 @@ idna==3.7
132132
# via httpx
133133
# via requests
134134
# via yarl
135-
importlib-resources==6.4.4
136-
# via matplotlib
137135
ipython==8.18.1
138136
# via ipywidgets
139137
ipywidgets==8.1.5
@@ -372,7 +370,6 @@ typing-extensions==4.12.2
372370
# via anyio
373371
# via google-generativeai
374372
# via huggingface-hub
375-
# via ipython
376373
# via langchain-core
377374
# via openai
378375
# via pydantic
@@ -399,5 +396,3 @@ widgetsnbextension==4.0.13
399396
# via ipywidgets
400397
yarl==1.9.4
401398
# via aiohttp
402-
zipp==3.20.1
403-
# via importlib-resources
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
2+
from .text_detection import detect_text

scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
"""
2+
screenshot_preparation module
3+
"""
14
import asyncio
2-
from playwright.async_api import async_playwright
3-
45
from io import BytesIO
56
from PIL import Image, ImageGrab
6-
7+
from playwright.async_api import async_playwright
8+
import cv2 as cv
9+
import numpy as np
10+
from io import BytesIO
711

812
async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
913
"""
@@ -20,23 +24,24 @@ async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
2024
browser = await p.chromium.launch(headless=True)
2125
page = await browser.new_page()
2226
await page.goto(url)
23-
image_bytes = await page.screenshot(path=save_path, type="jpeg", full_page=True, quality=quality)
27+
image_bytes = await page.screenshot(path=save_path,
28+
type="jpeg",
29+
full_page=True,
30+
quality=quality)
2431
await browser.close()
2532
return Image.open(BytesIO(image_bytes))
2633

27-
2834
def select_area_with_opencv(image):
2935
"""
30-
Allows you to manually select an image area using OpenCV. It is recommended to use this function if your project is on your computer, otherwise use select_area_with_ipywidget().
36+
Allows you to manually select an image area using OpenCV.
37+
It is recommended to use this function if your project is on your computer,
38+
otherwise use select_area_with_ipywidget().
3139
Parameters:
3240
image (PIL.Image): The image from which to select an area.
3341
Returns:
3442
A tuple containing the LEFT, TOP, RIGHT, and BOTTOM coordinates of the selected area.
3543
"""
3644

37-
import cv2 as cv
38-
import numpy as np
39-
4045
fullscreen_screenshot = ImageGrab.grab()
4146
dw, dh = fullscreen_screenshot.size
4247

@@ -100,7 +105,9 @@ def draw_selection_rectanlge(event, x, y, flags, param):
100105

101106
def select_area_with_ipywidget(image):
102107
"""
103-
Allows you to manually select an image area using ipywidgets. It is recommended to use this function if your project is in Google Colab, Kaggle or other similar platform, otherwise use select_area_with_opencv().
108+
Allows you to manually select an image area using ipywidgets.
109+
It is recommended to use this function if your project is in Google Colab,
110+
Kaggle, or another similar platform; otherwise use select_area_with_opencv().
104111
Parameters:
105112
image (PIL Image): The input image.
106113
Returns:
@@ -183,13 +190,15 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path:
183190
image (PIL.Image): The image to be cropped.
184191
LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None.
185192
TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None.
186-
RIGHT (int, optional): The x-coordinate of the right edge of the crop area. Defaults to None.
193+
RIGHT (int, optional): The x-coordinate of
194+
the right edge of the crop area. Defaults to None.
187195
BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None.
188196
save_path (str, optional): The path to save the cropped image. Defaults to None.
189197
Returns:
190198
PIL.Image: The cropped image.
191199
Notes:
192-
If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image.
200+
If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None,
201+
it will be set to the corresponding edge of the image.
193202
If save_path is specified, the cropped image will be saved as a JPEG file at the specified path.
194203
"""
195204

@@ -208,5 +217,3 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path:
208217
croped_image.save(save_path, "JPEG")
209218

210219
return image.crop((LEFT, TOP, RIGHT, BOTTOM))
211-
212-

scrapegraphai/utils/screenshot_scraping/text_detection.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
"""
2+
text_detection module
3+
"""
14
from surya.ocr import run_ocr
2-
import numpy as np
3-
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
5+
from surya.model.detection.model import (load_model as load_det_model,
6+
load_processor as load_det_processor)
47
from surya.model.recognition.model import load_model as load_rec_model
58
from surya.model.recognition.processor import load_processor as load_rec_processor
69

@@ -22,8 +25,5 @@ def detect_text(image, languages: list = ["en"]):
2225
rec_model, rec_processor = load_rec_model(), load_rec_processor()
2326
predictions = run_ocr([image], [langs], det_model,
2427
det_processor, rec_model, rec_processor)
25-
2628
text = "\n".join([line.text for line in predictions[0].text_lines])
2729
return text
28-
29-

0 commit comments

Comments
 (0)