Commit 402e5c4

Merge pull request #730 from ScrapeGraphAI/refactoring-of-pr-#729
2 parents 95d00e9 + eb42c44

3 files changed: +71 −2 lines
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
"""
Basic example of scraping pipeline using SmartScraper
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

groq_key = os.getenv("GROQ_APIKEY")

graph_config = {
    "llm": {
        "model": "groq/gemma-7b-it",
        "api_key": groq_key,
        "temperature": 0
    },
    "headless": False,
    "backend": "undetected_chromedriver"
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
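
The new example selects the undetected_chromedriver backend and reads GROQ_APIKEY via python-dotenv, so it expects the key to be exported in the environment or stored in a .env file next to the script. A minimal sketch of that file, with a placeholder value you would replace:

# .env (placeholder value, not a real key)
GROQ_APIKEY=your-groq-api-key-here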

examples/groq/smart_scraper_groq.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
         "api_key": groq_key,
         "temperature": 0
     },
-    "headless": False
+    "headless": False,
 }
 
 # ************************************************

scrapegraphai/docloaders/chromium.py

Lines changed: 23 additions & 1 deletion
@@ -61,6 +61,28 @@ def __init__(
         self.urls = urls
         self.load_state = load_state
 
+    async def ascrape_undetected_chromedriver(self, url: str) -> str:
+        """
+        Asynchronously scrape the content of a given URL using undetected-chromedriver with Selenium.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content, or an error message if an exception occurs.
+        """
+        import undetected_chromedriver as uc
+
+        logger.info(f"Starting scraping with {self.backend}...")
+        results = ""
+        try:
+            driver = uc.Chrome(headless=self.headless)
+            driver.get(url)
+            results = driver.page_source
+        except Exception as e:
+            results = f"Error: {e}"
+        return results
+
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
@@ -75,7 +97,7 @@ async def ascrape_playwright(self, url: str) -> str:
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia
 
-        logger.info("Starting scraping...")
+        logger.info(f"Starting scraping with {self.backend}...")
         results = ""
         async with async_playwright() as p:
             browser = await p.chromium.launch(
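
For reference, a minimal sketch of driving the new backend directly. It assumes the loader class in chromium.py is named ChromiumLoader and that it accepts backend and headless keyword arguments (the new method reads self.backend and self.headless, but the constructor signature is not part of this diff), so treat those names as illustrative:

import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader

# Assumed constructor kwargs: `backend` and `headless`, inferred from the
# attributes the new method reads; they are not shown in this diff.
loader = ChromiumLoader(
    ["https://perinim.github.io/projects/"],
    backend="undetected_chromedriver",
    headless=True,
)

# The method is declared async (even though it performs no awaits),
# so it can be driven with asyncio.run().
html = asyncio.run(
    loader.ascrape_undetected_chromedriver("https://perinim.github.io/projects/")
)
print(html[:500])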
