Skip to content

Commit 7b3ee4e

Browse files
committed
feat(docloaders): undetected-playwright
1 parent ae9986a commit 7b3ee4e

File tree

4 files changed

+7
-3
lines changed

4 files changed

+7
-3
lines changed

.gitignore

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph
3232
examples/**/result.csv
3333
examples/**/result.json
3434
main.py
35-
36-
35+
.idea

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ dependencies = [
3030
"playwright==1.43.0",
3131
"google==3.0.0",
3232
"yahoo-search-py==0.3",
33+
"undetected-playwright==0.3.0",
3334
]
3435

3536
license = "MIT"

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,3 +19,4 @@ langchain-aws==0.1.2
1919
langchain-anthropic==0.1.11
2020
yahoo-search-py==0.3
2121
pypdf==4.2.0
22+
undetected-playwright==0.3.0

scrapegraphai/docloaders/chromium.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@ async def ascrape_playwright(self, url: str) -> str:
6969
7070
"""
7171
from playwright.async_api import async_playwright
72+
from undetected_playwright import Malenia
7273

7374
logger.info("Starting scraping...")
7475
results = ""
@@ -77,7 +78,9 @@ async def ascrape_playwright(self, url: str) -> str:
7778
headless=self.headless, proxy=self.proxy, **self.browser_config
7879
)
7980
try:
80-
page = await browser.new_page()
81+
context = await browser.new_context()
82+
await Malenia.apply_stealth(context)
83+
page = await context.new_page()
8184
await page.goto(url)
8285
results = await page.content() # Simply get the HTML content
8386
logger.info("Content scraped")

0 commit comments

Comments
 (0)