@@ -1,5 +1,5 @@
1 | | -""""
2 | | -chromium module
| 1 | +"""
| 2 | +chromiumloader module
3 | 3 | """
4 | 4 | import asyncio
5 | 5 | from typing import Any, AsyncIterator, Iterator, List, Optional
@@ -83,7 +83,7 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
83 | 83 |                 async with async_timeout.timeout(self.TIMEOUT):
84 | 84 |                     driver = uc.Chrome(headless=self.headless)
85 | 85 |                     driver.get(url)
86 | | -                    results = driver.page_content
| 86 | +                    results = driver.page_source
87 | 87 |                     logger.info(f"Successfully scraped {url}")
88 | 88 |                     break
89 | 89 |             except (aiohttp.ClientError, asyncio.TimeoutError) as e:
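The one-line change above swaps a nonexistent attribute for the real Selenium API: undetected_chromedriver's Chrome subclasses Selenium's WebDriver, which exposes the rendered DOM through the page_source property, while page_content simply raises AttributeError. A minimal standalone sketch of the corrected call (example.com is a stand-in URL):

    import undetected_chromedriver as uc

    driver = uc.Chrome(headless=True)
    try:
        driver.get("https://example.com")
        html = driver.page_source  # serialized HTML of the current DOM
    finally:
        driver.quit()  # always release the browser process
    print(html[:80])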
@@ -137,6 +137,45 @@ async def ascrape_playwright(self, url: str) -> str:
137 | 137 |
138 | 138 |         return results
139 | 139 |
| 140 | +    async def ascrape_with_js_support(self, url: str) -> str:
| 141 | +        """
| 142 | +        Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
| 143 | +
| 144 | +        Args:
| 145 | +            url (str): The URL to scrape.
| 146 | +
| 147 | +        Returns:
| 148 | +            str: The fully rendered HTML content after JavaScript execution,
| 149 | +                or an error message if an exception occurs.
| 150 | +        """
| 151 | +        from playwright.async_api import async_playwright
| 152 | +
| 153 | +        logger.info(f"Starting scraping with JavaScript support for {url}...")
| 154 | +        results = ""
| 155 | +        attempt = 0
| 156 | +
| 157 | +        while attempt < self.RETRY_LIMIT:
| 158 | +            try:
| 159 | +                async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
| 160 | +                    browser = await p.chromium.launch(
| 161 | +                        headless=self.headless, proxy=self.proxy, **self.browser_config
| 162 | +                    )
| 163 | +                    context = await browser.new_context()
| 164 | +                    page = await context.new_page()
| 165 | +                    await page.goto(url, wait_until="networkidle")
| 166 | +                    results = await page.content()
| 167 | +                    logger.info("Content scraped after JavaScript rendering")
| 168 | +                    break
| 169 | +            except Exception as e:  # also covers aiohttp.ClientError and asyncio.TimeoutError
| 170 | +                attempt += 1
| 171 | +                logger.error(f"Attempt {attempt} failed: {e}")
| 172 | +                if attempt == self.RETRY_LIMIT:
| 173 | +                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
| 174 | +            finally:
| 175 | +                if "browser" in locals(): await browser.close()  # launch may have failed before assignment
| 176 | +
| 177 | +        return results
| 178 | +
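A hypothetical caller for the new coroutine, assuming the surrounding class is the module's ChromiumLoader (name inferred from the docstring at the top of this diff) and that it is constructed with a list of URLs plus the headless flag used elsewhere in the file:

    import asyncio

    async def main() -> None:
        # assumes this runs alongside the ChromiumLoader class in this module
        loader = ChromiumLoader(["https://example.com"], headless=True)
        html = await loader.ascrape_with_js_support("https://example.com")
        print(html[:200])

    asyncio.run(main())

Note the wait_until="networkidle" argument to page.goto(): unlike the default "load" event, it resolves only after the page has stopped issuing network requests, so markup injected by client-side JavaScript is already present in page.content().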
140 | 179 |     def lazy_load(self) -> Iterator[Document]:
141 | 180 |         """
142 | 181 |         Lazily load text content from the provided URLs.