Commit 72f16ab

Merge pull request #736 from ScrapeGraphAI/refractoring-chromiumloader
2 parents 44d10aa + 88ba231

File tree

1 file changed: +86, -23 lines changed

scrapegraphai/docloaders/chromium.py

Lines changed: 86 additions & 23 deletions
@@ -1,10 +1,12 @@
 """
-Chromium module
+chromiumloader module
 """
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
+import aiohttp
+import async_timeout
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
 
 logger = get_logger("web-loader")
@@ -21,6 +23,9 @@ class ChromiumLoader(BaseLoader):
         urls: A list of URLs to scrape content from.
     """
 
+    RETRY_LIMIT = 3
+    TIMEOUT = 10
+
     def __init__(
         self,
         urls: List[str],
@@ -66,17 +71,29 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
 
         Returns:
             str: The scraped HTML content or an error message if an exception occurs.
-
         """
         import undetected_chromedriver as uc
 
         logger.info(f"Starting scraping with {self.backend}...")
         results = ""
-        try:
-            driver = uc.Chrome(headless=self.headless)
-            results = driver.get(url).page_content
-        except Exception as e:
-            results = f"Error: {e}"
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_timeout.timeout(self.TIMEOUT):
+                    driver = uc.Chrome(headless=self.headless)
+                    driver.get(url)
+                    results = driver.page_source
+                    logger.info(f"Successfully scraped {url}")
+                    break
+            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                driver.quit()
+
         return results
 
     async def ascrape_playwright(self, url: str) -> str:
@@ -88,28 +105,75 @@ async def ascrape_playwright(self, url: str) -> str:
 
         Returns:
             str: The scraped HTML content or an error message if an exception occurs.
-
         """
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia
 
         logger.info(f"Starting scraping with {self.backend}...")
         results = ""
-        async with async_playwright() as p:
-            browser = await p.chromium.launch(
-                headless=self.headless, proxy=self.proxy, **self.browser_config
-            )
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+                    results = await page.content()
+                    logger.info("Content scraped")
+                    break
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                await browser.close()
+
+        return results
+
+    async def ascrape_with_js_support(self, url: str) -> str:
+        """
+        Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The fully rendered HTML content after JavaScript execution,
+            or an error message if an exception occurs.
+        """
+        from playwright.async_api import async_playwright
+
+        logger.info(f"Starting scraping with JavaScript support for {url}...")
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
             try:
-                context = await browser.new_context()
-                await Malenia.apply_stealth(context)
-                page = await context.new_page()
-                await page.goto(url, wait_until="domcontentloaded")
-                await page.wait_for_load_state(self.load_state)
-                results = await page.content()  # Simply get the HTML content
-                logger.info("Content scraped")
-            except Exception as e:
-                results = f"Error: {e}"
-            await browser.close()
+                async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="networkidle")
+                    results = await page.content()
+                    logger.info("Content scraped after JavaScript rendering")
+                    break
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                await browser.close()
+
         return results
 
     def lazy_load(self) -> Iterator[Document]:
@@ -121,7 +185,6 @@ def lazy_load(self) -> Iterator[Document]:
 
         Yields:
             Document: The scraped content encapsulated within a Document object.
-
         """
         scraping_fn = getattr(self, f"ascrape_{self.backend}")
 