feat(docloaders): apply undetected-playwright stealth in Chromium loader

QIN2DIM · QIN2DIM · commit 7b3ee4e71e4a · 2024-05-19T18:01:03.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph
 examples/**/result.csv
 examples/**/result.json
 main.py
-
- 
+.idea
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
     "playwright==1.43.0",
     "google==3.0.0",
     "yahoo-search-py==0.3",
+    "undetected-playwright==0.3.0",
 ]
 
 license = "MIT"
diff --git a/requirements.txt b/requirements.txt
@@ -19,3 +19,4 @@ langchain-aws==0.1.2
 langchain-anthropic==0.1.11 
 yahoo-search-py==0.3
 pypdf==4.2.0
+undetected-playwright==0.3.0
diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
@@ -69,6 +69,7 @@ async def ascrape_playwright(self, url: str) -> str:
 
         """
         from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
 
         logger.info("Starting scraping...")
         results = ""
@@ -77,7 +78,9 @@ async def ascrape_playwright(self, url: str) -> str:
                 headless=self.headless, proxy=self.proxy, **self.browser_config
             )
             try:
-                page = await browser.new_page()
+                context = await browser.new_context()
+                await Malenia.apply_stealth(context)
+                page = await context.new_page()
                 await page.goto(url)
                 results = await page.content()  # Simply get the HTML content
                 logger.info("Content scraped")