
Commit fa951b4

Merge pull request #368 from stevenmichaelthomas/wait-for-network-idle
Add the ability to specify load state
2 parents e5bb5ae + 8f405ff commit fa951b4

2 files changed: 6 additions & 0 deletions
scrapegraphai/docloaders/chromium.py

Lines changed: 3 additions & 0 deletions

@@ -29,6 +29,7 @@ def __init__(
         backend: str = "playwright",
         headless: bool = True,
         proxy: Optional[Proxy] = None,
+        load_state: str = "domcontentloaded",
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -55,6 +56,7 @@ def __init__(
         self.headless = headless
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
+        self.load_state = load_state

     async def ascrape_playwright(self, url: str) -> str:
         """
@@ -81,6 +83,7 @@ async def ascrape_playwright(self, url: str) -> str:
                await Malenia.apply_stealth(context)
                page = await context.new_page()
                await page.goto(url)
+               await page.wait_for_load_state(self.load_state)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
        except Exception as e:
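The new load_state argument is stored on the loader and passed to Playwright's page.wait_for_load_state() after navigation, so callers can wait for "networkidle" instead of the default "domcontentloaded" before the page content is read. Below is a minimal usage sketch, not taken from the diff: it assumes the class is named ChromiumLoader, that the URL list is its first positional argument, and that it is driven from an asyncio event loop; the example URL is hypothetical.

    import asyncio

    from scrapegraphai.docloaders.chromium import ChromiumLoader


    async def main() -> None:
        loader = ChromiumLoader(
            ["https://example.com"],   # assumed first positional argument
            backend="playwright",
            headless=True,
            # New parameter from this commit: the Playwright load state to wait
            # for after page.goto(). Playwright accepts "load",
            # "domcontentloaded", and "networkidle"; the default remains
            # "domcontentloaded".
            load_state="networkidle",
        )
        html = await loader.ascrape_playwright("https://example.com")
        print(html[:200])


    asyncio.run(main())

Waiting for "networkidle" is useful for pages that populate their content via late XHR/fetch calls, at the cost of slower scrapes on chatty pages.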

scrapegraphai/nodes/search_link_node.py

Lines changed: 3 additions & 0 deletions

@@ -83,6 +83,9 @@ def execute(self, state: dict) -> dict:
 
         Assume relevance broadly, including any links that might be related or potentially useful
         in relation to the task.
+
+        Sort it in order of importance, the first one should be the most important one, the last one
+        the least important
 
         Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
         whether the content at the link is directly relevant.
