
Commit 88ba231

feat: add dynamic rendering
1 parent 4f816f3 commit 88ba231

File tree

1 file changed (+42, -3 lines)


scrapegraphai/docloaders/chromium.py

Lines changed: 42 additions & 3 deletions
@@ -1,5 +1,5 @@
-""""
-chromium module
+"""
+chromiumloader module
 """
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
@@ -83,7 +83,7 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
                 async with async_timeout.timeout(self.TIMEOUT):
                     driver = uc.Chrome(headless=self.headless)
                     driver.get(url)
-                    results = driver.page_content
+                    results = driver.page_source
                     logger.info(f"Successfully scraped {url}")
                     break
             except (aiohttp.ClientError, asyncio.TimeoutError) as e:
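
A note on the one-line fix above: page_source is the standard Selenium WebDriver property holding the current page's HTML, and undetected-chromedriver's uc.Chrome subclasses Selenium's Chrome driver, so it inherits that property; there is no page_content attribute. A minimal sketch of the same access pattern outside the loader:

import undetected_chromedriver as uc

driver = uc.Chrome(headless=True)
driver.get("https://example.com")
# page_source is inherited from Selenium's WebDriver;
# page_content would raise AttributeError, which is what this hunk fixes.
html = driver.page_source
driver.quit()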
@@ -137,6 +137,45 @@ async def ascrape_playwright(self, url: str) -> str:
 
         return results
 
+    async def ascrape_with_js_support(self, url: str) -> str:
+        """
+        Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The fully rendered HTML content after JavaScript execution,
+            or an error message if an exception occurs.
+        """
+        from playwright.async_api import async_playwright
+
+        logger.info(f"Starting scraping with JavaScript support for {url}...")
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="networkidle")
+                    results = await page.content()
+                    logger.info("Content scraped after JavaScript rendering")
+                    break
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                await browser.close()
+
+        return results
+
     def lazy_load(self) -> Iterator[Document]:
         """
         Lazily load text content from the provided URLs.
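
For context, here is a minimal usage sketch of the method this commit adds. The class name ChromiumLoader and its constructor arguments are not shown in the diff, so treat them as assumptions; check the class definition in chromium.py for the exact signature.

import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader  # assumed class name

async def main() -> None:
    # Assumed constructor: a list of URLs plus optional settings such as headless.
    loader = ChromiumLoader(["https://example.com"], headless=True)
    # Render the page with JavaScript enabled and return the final HTML.
    html = await loader.ascrape_with_js_support("https://example.com")
    print(html[:200])

asyncio.run(main())

The design hinge is wait_until="networkidle": Playwright resolves page.goto only after there have been no network connections for at least 500 ms, which gives JavaScript-driven pages time to finish populating the DOM before page.content() snapshots it.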
