-
-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Screenshot scraping #606
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
VinciGit00
merged 12 commits into
ScrapeGraphAI:screenshot-scraper-fix
from
Santabot123:pre/beta
Aug 30, 2024
Merged
Screenshot scraping #606
Changes from all commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
49ae56c
Added screenshot preparation script for screenshot scraping
Santabot123 e11f0cd
Added text_detection.py and updated screenshot_preparation.py
Santabot123 b4f8ea4
add __init__.py and docstrings
Santabot123 0cf7c44
correct the typo and updated select_area_with_ipywidget()
Santabot123 7e23c3d
correct the typo
Santabot123 c0a0e69
remove some comments and image
Santabot123 92bec28
Updated requirements.txt
Santabot123 aa9e85f
remove some comments and image
Santabot123 90d7549
updated requirements.txt
Santabot123 6e9911c
Merge branch 'pre/beta' of https://github.com/Santabot123/Scrapegraph…
Santabot123 55a7727
Update requirements.txt
Santabot123 735120d
Merge branch 'screenshot-scraper-fix' into pre/beta
VinciGit00 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
""" | ||
__init__.py file for screenshot_scraping folder | ||
""" | ||
|
||
|
||
from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image | ||
from .text_detection import detect_text | ||
|
212 changes: 212 additions & 0 deletions
212
scrapegraphai/screenshot_scraping/screenshot_preparation.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
import asyncio | ||
from playwright.async_api import async_playwright | ||
|
||
from io import BytesIO | ||
from PIL import Image, ImageGrab | ||
|
||
|
||
async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
    """
    Takes a full-page JPEG screenshot of a webpage at the specified URL and saves it if the save_path is specified.

    Parameters:
        url (str): The URL of the webpage to take a screenshot of.
        save_path (str): The path to save the screenshot to. It is forwarded to Playwright's page.screenshot(). Defaults to None (nothing is written to disk).
        quality (int): The quality of the jpeg image, between 1 and 100. Defaults to 100.

    Returns:
        PIL.Image: The screenshot of the webpage as a PIL Image object.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            image_bytes = await page.screenshot(
                path=save_path, type="jpeg", full_page=True, quality=quality
            )
        finally:
            # Close the browser even when navigation or capture raises,
            # so a failed request does not leave the browser open.
            await browser.close()
        return Image.open(BytesIO(image_bytes))
|
||
|
||
def select_area_with_opencv(image):
    """
    Allows you to manually select an image area using OpenCV. It is recommended to use this function if your project is on your computer, otherwise use select_area_with_ipywidget().

    The image is shown in a window; drag with the left mouse button to mark a rectangle, then press any key to confirm.

    Parameters:
        image (PIL.Image): The image from which to select an area.

    Returns:
        A tuple containing the LEFT, TOP, RIGHT, and BOTTOM coordinates of the selected area. If a key is pressed before any rectangle has been drawn, the bounds of the whole image are returned.
    """
    import cv2 as cv
    import numpy as np

    # The screen grab is only used to learn the display height so the
    # selection window can be sized to fit the screen.
    _, dh = ImageGrab.grab().size
    width, height = image.size

    # Selection state lives in this function's scope (not in module globals)
    # so repeated calls never see coordinates left over from an earlier call.
    drawing = False
    ix, iy = -1, -1
    left, top, right, bottom = 0, 0, width, height

    img = cv.cvtColor(np.array(image), cv.COLOR_RGB2BGR)
    img = cv.rectangle(img, (0, 0), (width, height), (0, 0, 255), 10)
    img = cv.putText(img, 'SELECT AN AREA', (int(width * 0.3), 100),
                     cv.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 5)

    # Untouched copy that is blended back in while dragging.
    overlay = img.copy()
    alpha = 0.3

    def draw_selection_rectangle(event, x, y, flags, param):
        """Mouse callback: track the drag and record the selected rectangle."""
        nonlocal drawing, ix, iy, img, left, top, right, bottom
        if event == cv.EVENT_LBUTTONDOWN:
            drawing = True
            ix, iy = x, y
        elif event == cv.EVENT_MOUSEMOVE:
            if drawing:
                cv.rectangle(img, (ix, iy), (x, y), (41, 215, 162), -1)
                cv.putText(img, 'PRESS ANY KEY TO SELECT THIS AREA',
                           (ix, iy - 10), cv.FONT_HERSHEY_SIMPLEX, 1.5,
                           (55, 46, 252), 5)
                img = cv.addWeighted(overlay, alpha, img, 1 - alpha, 0)
        elif event == cv.EVENT_LBUTTONUP and drawing:
            # Only a release that ends a drag started in this window counts;
            # otherwise (ix, iy) would still hold the -1 sentinel.
            drawing = False
            # Normalise so LEFT <= RIGHT and TOP <= BOTTOM whatever the
            # drag direction was.
            left, right = sorted((int(ix), int(x)))
            top, bottom = sorted((int(iy), int(y)))

    while True:
        # Window setup is repeated on every iteration, as in the original.
        cv.namedWindow('SELECT AREA', cv.WINDOW_KEEPRATIO)
        cv.setMouseCallback('SELECT AREA', draw_selection_rectangle)
        # Scale the window to the screen height, keeping the aspect ratio.
        cv.resizeWindow('SELECT AREA', int(width / (height / dh)), dh)

        cv.imshow('SELECT AREA', img)

        # Any key press confirms the current selection.
        if cv.waitKey(20) > -1:
            break

    cv.destroyAllWindows()
    return left, top, right, bottom
|
||
|
||
def select_area_with_ipywidget(image):
    """
    Allows you to manually select an image area using ipywidgets. It is recommended to use this function if your project is in Google Colab, Kaggle or other similar platform, otherwise use select_area_with_opencv().

    Parameters:
        image (PIL Image): The input image.

    Returns:
        tuple: (left_right_slider, top_bottom_slider), the two ipywidgets IntRangeSlider objects. Read their ``.value`` after adjusting them. The top_bottom values are measured from the bottom edge of the image: the guide lines are drawn at ``image_height - value``.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from ipywidgets import interact
    import ipywidgets as widgets

    img_array = np.array(image)
    img_height, img_width = img_array.shape[0], img_array.shape[1]

    print(img_array.shape)

    def update_plot(top_bottom, left_right, image_size):
        """Redraw the image with guide lines at the current slider values."""
        plt.figure(figsize=(image_size, image_size))
        plt.imshow(img_array)
        plt.axvline(x=left_right[0], color='blue', linewidth=1)
        plt.text(left_right[0]+1, -25, 'LEFT', rotation=90, color='blue')
        plt.axvline(x=left_right[1], color='red', linewidth=1)
        plt.text(left_right[1]+1, -25, 'RIGHT', rotation=90, color='red')

        # The vertical slider grows upwards while image rows grow downwards,
        # so slider values are flipped against the image height.
        plt.axhline(y=img_height - top_bottom[0], color='green', linewidth=1)
        plt.text(-100, img_height - top_bottom[0]+1, 'BOTTOM', color='green')
        plt.axhline(y=img_height - top_bottom[1],
                    color='darkorange', linewidth=1)
        plt.text(-100, img_height - top_bottom[1]+1, 'TOP', color='darkorange')
        plt.axis('off')
        plt.show()

    top_bottom_slider = widgets.IntRangeSlider(
        value=[int(img_height*0.25), int(img_height*0.75)],
        min=0,
        max=img_height,
        step=1,
        description='top_bottom:',
        disabled=False,
        continuous_update=True,
        orientation='vertical',
        readout=True,
        readout_format='d',
    )

    left_right_slider = widgets.IntRangeSlider(
        value=[int(img_width*0.25), int(img_width*0.75)],
        min=0,
        max=img_width,
        step=1,
        description='left_right:',
        disabled=False,
        continuous_update=True,
        orientation='horizontal',
        readout=True,
        readout_format='d',
    )
    image_size_bt = widgets.BoundedIntText(
        value=10,
        min=2,
        max=20,
        step=1,
        description='Image size:',
        disabled=False
    )

    interact(update_plot, top_bottom=top_bottom_slider,
             left_right=left_right_slider, image_size=image_size_bt)

    return left_right_slider, top_bottom_slider
|
||
|
||
def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path: str = None):
    """
    Crop an image using the specified coordinates.

    Parameters:
        image (PIL.Image): The image to be cropped.
        LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None.
        TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None.
        RIGHT (int, optional): The x-coordinate of the right edge of the crop area. Defaults to None.
        BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None.
        save_path (str, optional): The path to save the cropped image. Defaults to None.

    Returns:
        PIL.Image: The cropped image.

    Notes:
        If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image.
        If save_path is specified, the cropped image will be saved as a JPEG file at the specified path.
    """
    width, height = image.size

    # Compare against None explicitly: 0 is a valid coordinate.
    if LEFT is None:
        LEFT = 0
    if TOP is None:
        TOP = 0
    if RIGHT is None:
        RIGHT = width
    if BOTTOM is None:
        BOTTOM = height

    cropped_image = image.crop((LEFT, TOP, RIGHT, BOTTOM))
    if save_path is not None:
        cropped_image.save(save_path, "JPEG")

    # Return the crop that was (optionally) saved instead of cropping again.
    return cropped_image
|
||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from surya.ocr import run_ocr | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @VinciGit00, try placing `import typing_extensions` as line 1. |
||
import numpy as np | ||
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor | ||
from surya.model.recognition.model import load_model as load_rec_model | ||
from surya.model.recognition.processor import load_processor as load_rec_processor | ||
|
||
|
||
def detect_text(image, languages: list = None):
    """
    Detects and extracts text from a given image.

    Parameters:
        image (PIL Image): The input image to extract text from.
        languages (list): A list of languages to detect text in. Defaults to ["en"]. List of languages can be found here: https://github.com/VikParuchuri/surya/blob/master/surya/languages.py

    Returns:
        str: The extracted text from the image, one detected line per row.

    Notes:
        Model weights will automatically download the first time you run this function.
        The detection and recognition models are loaded on every call.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    langs = ["en"] if languages is None else languages

    det_processor, det_model = load_det_processor(), load_det_model()
    rec_model, rec_processor = load_rec_model(), load_rec_processor()

    # run_ocr takes a list of images and a matching list of language lists;
    # only one image is passed, so only predictions[0] is read.
    predictions = run_ocr([image], [langs], det_model,
                          det_processor, rec_model, rec_processor)

    return "\n".join(line.text for line in predictions[0].text_lines)
|
||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, why the uppercase arguments?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In a nutshell, 3 out of the 5 functions that I added are part of my other project and I just forgot to change them.