
Commit fa951b4

Merge pull request #368 from stevenmichaelthomas/wait-for-network-idle
Add the ability to specify load state
2 parents e5bb5ae + 8f405ff commit fa951b4

2 files changed: 6 additions & 0 deletions
scrapegraphai/docloaders/chromium.py

Lines changed: 3 additions & 0 deletions

@@ -29,6 +29,7 @@ def __init__(
         backend: str = "playwright",
         headless: bool = True,
         proxy: Optional[Proxy] = None,
+        load_state: str = "domcontentloaded",
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -55,6 +56,7 @@ def __init__(
         self.headless = headless
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
+        self.load_state = load_state

     async def ascrape_playwright(self, url: str) -> str:
         """
@@ -81,6 +83,7 @@ async def ascrape_playwright(self, url: str) -> str:
                await Malenia.apply_stealth(context)
                page = await context.new_page()
                await page.goto(url)
+               await page.wait_for_load_state(self.load_state)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
        except Exception as e:
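The new load_state argument is stored on the loader and passed to Playwright's page.wait_for_load_state() after navigation, so callers can wait for "networkidle" instead of the default "domcontentloaded" before the page content is read. Below is a minimal usage sketch, not taken from the diff: it assumes the class is named ChromiumLoader, that the URL list is its first positional argument, and that it is driven from an asyncio event loop; the example URL is hypothetical.

    import asyncio

    from scrapegraphai.docloaders.chromium import ChromiumLoader


    async def main() -> None:
        loader = ChromiumLoader(
            ["https://example.com"],   # assumed first positional argument
            backend="playwright",
            headless=True,
            # New parameter from this commit: the Playwright load state to wait
            # for after page.goto(). Playwright accepts "load",
            # "domcontentloaded", and "networkidle"; the default remains
            # "domcontentloaded".
            load_state="networkidle",
        )
        html = await loader.ascrape_playwright("https://example.com")
        print(html[:200])


    asyncio.run(main())

Waiting for "networkidle" is useful for pages that populate their content via late XHR/fetch calls, at the cost of slower scrapes on chatty pages.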

scrapegraphai/nodes/search_link_node.py

Lines changed: 3 additions & 0 deletions

@@ -83,6 +83,9 @@ def execute(self, state: dict) -> dict:
 
         Assume relevance broadly, including any links that might be related or potentially useful
         in relation to the task.
+
+        Sort it in order of importance, the first one should be the most important one, the last one
+        the least important
 
         Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
         whether the content at the link is directly relevant.
