Commit 402e5c4

Merge pull request #730 from ScrapeGraphAI/refactoring-of-pr-#729
2 parents 95d00e9 + eb42c44

3 files changed: +71 −2 lines
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
"""
Basic example of scraping pipeline using SmartScraper
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

groq_key = os.getenv("GROQ_APIKEY")

graph_config = {
    "llm": {
        "model": "groq/gemma-7b-it",
        "api_key": groq_key,
        "temperature": 0
    },
    "headless": False,
    "backend": "undetected_chromedriver"
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
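
The new example selects the undetected_chromedriver backend and reads GROQ_APIKEY via python-dotenv, so it expects the key to be exported in the environment or stored in a .env file next to the script. A minimal sketch of that file, with a placeholder value you would replace:

# .env (placeholder value, not a real key)
GROQ_APIKEY=your-groq-api-key-here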

examples/groq/smart_scraper_groq.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
         "api_key": groq_key,
         "temperature": 0
     },
-    "headless": False
+    "headless": False,
 }
 
 # ************************************************

scrapegraphai/docloaders/chromium.py

Lines changed: 23 additions & 1 deletion
@@ -61,6 +61,28 @@ def __init__(
         self.urls = urls
         self.load_state = load_state
 
+    async def ascrape_undetected_chromedriver(self, url: str) -> str:
+        """
+        Asynchronously scrape the content of a given URL using undetected-chromedriver with Selenium.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content, or an error message if an exception occurs.
+        """
+        import undetected_chromedriver as uc
+
+        logger.info(f"Starting scraping with {self.backend}...")
+        results = ""
+        try:
+            driver = uc.Chrome(headless=self.headless)
+            driver.get(url)
+            results = driver.page_source
+        except Exception as e:
+            results = f"Error: {e}"
+        return results
+
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
@@ -75,7 +97,7 @@ async def ascrape_playwright(self, url: str) -> str:
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia
 
-        logger.info("Starting scraping...")
+        logger.info(f"Starting scraping with {self.backend}...")
         results = ""
         async with async_playwright() as p:
             browser = await p.chromium.launch(
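
For reference, a minimal sketch of driving the new backend directly. It assumes the loader class in chromium.py is named ChromiumLoader and that it accepts backend and headless keyword arguments (the new method reads self.backend and self.headless, but the constructor signature is not part of this diff), so treat those names as illustrative:

import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader

# Assumed constructor kwargs: `backend` and `headless`, inferred from the
# attributes the new method reads; they are not shown in this diff.
loader = ChromiumLoader(
    ["https://perinim.github.io/projects/"],
    backend="undetected_chromedriver",
    headless=True,
)

# The method is declared async (even though it performs no awaits),
# so it can be driven with asyncio.run().
html = asyncio.run(
    loader.ascrape_undetected_chromedriver("https://perinim.github.io/projects/")
)
print(html[:500])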
