Commit 768719c

feat(safe-web-driver): enhance the original AsyncChromiumLoader web driver with proxy protection, flexible kwargs, and a configurable backend
The original class prevents kwargs from being passed down to the Playwright backend, which makes some configurations unfeasible, including passing a proxy server to the web driver. The new class is backward compatible with the original, but 1) allows any kwarg to be passed down to the web driver, 2) allows specifying the web driver backend (only Playwright is supported for now) in case more backends (e.g., Selenium) are supported in the future, and 3) automatically fetches a suitable proxy if one is not already provided.
1 parent 2170131 commit 768719c
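
A minimal usage sketch of the new loader (not part of the commit). The URL, the proxy dict shape, and the extra slow_mo kwarg are illustrative assumptions; the point is that kwargs not consumed by ChromiumLoader itself are forwarded verbatim to Playwright's chromium.launch().

from scrapegraphai.docloaders import ChromiumLoader

# Hypothetical values for illustration only.
loader = ChromiumLoader(
    ["https://example.com"],
    backend="playwright",  # only playwright is supported for now
    headless=True,
    proxy={"server": "http://127.0.0.1:8080"},  # assumed shape; parsed by parse_or_search_proxy
    slow_mo=50,  # example of an extra kwarg reaching chromium.launch()
)
docs = list(loader.lazy_load())  # one Document per URL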

File tree

3 files changed: +180 -40 lines changed


scrapegraphai/docloaders/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
"""__init__.py file for docloaders folder"""

from .chromium import ChromiumLoader

scrapegraphai/docloaders/chromium.py

Lines changed: 125 additions & 0 deletions

@@ -0,0 +1,125 @@
import asyncio
import logging
from typing import Any, AsyncIterator, Iterator, List, Optional

from langchain_core.documents import Document

from ..utils import Proxy, dynamic_import, parse_or_search_proxy


logger = logging.getLogger(__name__)


class ChromiumLoader:
    """scrapes HTML pages from URLs using a (headless) instance of the
    Chromium web driver with proxy protection

    Attributes:
        backend: The web driver backend library; defaults to 'playwright'.
        browser_config: A dictionary containing additional browser kwargs.
        headless: whether to run browser in headless mode.
        proxy: A dictionary containing proxy settings; None disables protection.
        urls: A list of URLs to scrape content from.
    """

    def __init__(
        self,
        urls: List[str],
        *,
        backend: str = "playwright",
        headless: bool = True,
        proxy: Optional[Proxy] = None,
        **kwargs: Any,
    ):
        """Initialize the loader with a list of URL paths.

        Args:
            backend: The web driver backend library; defaults to 'playwright'.
            headless: whether to run browser in headless mode.
            proxy: A dictionary containing proxy information; None disables protection.
            urls: A list of URLs to scrape content from.
            kwargs: A dictionary containing additional browser kwargs.

        Raises:
            ImportError: If the required backend package is not installed.
        """
        message = (
            f"{backend} is required for ChromiumLoader. "
            f"Please install it with `pip install {backend}`."
        )

        dynamic_import(backend, message)

        self.backend = backend
        self.browser_config = kwargs
        self.headless = headless
        self.proxy = parse_or_search_proxy(proxy) if proxy else None
        self.urls = urls

    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.

        """
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=self.headless, proxy=self.proxy, **self.browser_config
            )
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                results = f"Error: {e}"
            await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they're scraped,
        instead of waiting to scrape all URLs before returning.

        Yields:
            Document: The scraped content encapsulated within a Document object.

        """
        scraping_fn = getattr(self, f"ascrape_{self.backend}")

        for url in self.urls:
            html_content = asyncio.run(scraping_fn(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """
        Asynchronously load text content from the provided URLs.

        This method leverages asyncio to initiate the scraping of all provided URLs
        simultaneously. It improves performance by utilizing concurrent asynchronous
        requests. Each Document is yielded as soon as its content is available,
        encapsulating the scraped content.

        Yields:
            Document: A Document object containing the scraped content, along with its
                source URL as metadata.
        """
        scraping_fn = getattr(self, f"ascrape_{self.backend}")

        tasks = [scraping_fn(url) for url in self.urls]
        results = await asyncio.gather(*tasks)
        for url, content in zip(self.urls, results):
            metadata = {"source": url}
            yield Document(page_content=content, metadata=metadata)
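
For context, a hedged sketch of how the two loading paths above might be consumed (the URLs are placeholders): lazy_load() scrapes one URL at a time via asyncio.run(), while alazy_load() schedules all scrapes concurrently with asyncio.gather() and yields one Document per URL.

import asyncio

from scrapegraphai.docloaders import ChromiumLoader


async def main() -> None:
    # Placeholder URLs; both pages are scraped concurrently.
    loader = ChromiumLoader(["https://example.com", "https://example.org"])
    async for doc in loader.alazy_load():
        print(doc.metadata["source"], len(doc.page_content))


asyncio.run(main())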

scrapegraphai/nodes/fetch_node.py

Lines changed: 52 additions & 40 deletions

@@ -1,21 +1,24 @@
 """
 FetchNode Module
 """
-import pandas as pd
+
 import json
 from typing import List, Optional
-from langchain_community.document_loaders import AsyncChromiumLoader
-from langchain_core.documents import Document
+
+import pandas as pd
 from langchain_community.document_loaders import PyPDFLoader
-from .base_node import BaseNode
+from langchain_core.documents import Document
+
+from ..docloaders import ChromiumLoader
 from ..utils.remover import remover
+from .base_node import BaseNode
 
 
 class FetchNode(BaseNode):
     """
     A node responsible for fetching the HTML content of a specified URL and updating
-    the graph's state with this content. It uses the AsyncChromiumLoader to fetch the
-    content asynchronously.
+    the graph's state with this content. It uses ChromiumLoader to fetch
+    the content from a web page asynchronously (with proxy protection).
 
     This node acts as a starting point in many scraping workflows, preparing the state
     with the necessary HTML content for further processing by subsequent nodes in the graph.
@@ -31,13 +34,21 @@ class FetchNode(BaseNode):
         node_name (str): The unique identifier name for the node, defaulting to "Fetch".
     """
 
-    def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "Fetch",
+    ):
         super().__init__(node_name, "node", input, output, 1)
 
-        self.headless = True if node_config is None else node_config.get(
-            "headless", True)
-        self.verbose = False if node_config is None else node_config.get(
-            "verbose", False)
+        self.headless = (
+            True if node_config is None else node_config.get("headless", True)
+        )
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
 
     def execute(self, state):
         """
@@ -64,10 +75,14 @@ def execute(self, state):
         input_data = [state[key] for key in input_keys]
 
         source = input_data[0]
-        if self.input == "json_dir" or self.input == "xml_dir" or self.input == "csv_dir":
-            compressed_document = [Document(page_content=source, metadata={
-                "source": "local_dir"
-            })]
+        if (
+            self.input == "json_dir"
+            or self.input == "xml_dir"
+            or self.input == "csv_dir"
+        ):
+            compressed_document = [
+                Document(page_content=source, metadata={"source": "local_dir"})
+            ]
         # if it is a local directory
 
         # handling for pdf
@@ -76,45 +91,42 @@ def execute(self, state):
             compressed_document = loader.load()
 
         elif self.input == "csv":
-            compressed_document = [Document(page_content=str(pd.read_csv(source)), metadata={
-                "source": "csv"
-            })]
+            compressed_document = [
+                Document(
+                    page_content=str(pd.read_csv(source)), metadata={"source": "csv"}
+                )
+            ]
         elif self.input == "json":
             f = open(source)
-            compressed_document = [Document(page_content=str(json.load(f)), metadata={
-                "source": "json"
-            })]
+            compressed_document = [
+                Document(page_content=str(json.load(f)), metadata={"source": "json"})
+            ]
         elif self.input == "xml":
-            with open(source, 'r', encoding='utf-8') as f:
+            with open(source, "r", encoding="utf-8") as f:
                 data = f.read()
-            compressed_document = [Document(page_content=data, metadata={
-                "source": "xml"
-            })]
+            compressed_document = [
+                Document(page_content=data, metadata={"source": "xml"})
+            ]
         elif self.input == "pdf_dir":
             pass
 
         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
-                "source": "local_dir"
-            })]
+            compressed_document = [
+                Document(page_content=remover(source), metadata={"source": "local_dir"})
+            ]
 
         else:
-            if self.node_config is not None and self.node_config.get("endpoint") is not None:
+            loader_kwargs = {}
 
-                loader = AsyncChromiumLoader(
-                    [source],
-                    proxies={"http": self.node_config["endpoint"]},
-                    headless=self.headless,
-                )
-            else:
-                loader = AsyncChromiumLoader(
-                    [source],
-                    headless=self.headless,
-                )
+            if self.node_config is not None:
+                loader_kwargs = self.node_config.get("loader_kwargs", {})
+
+            loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
 
             document = loader.load()
             compressed_document = [
-                Document(page_content=remover(str(document[0].page_content)))]
+                Document(page_content=remover(str(document[0].page_content)))
+            ]
 
         state.update({self.output[0]: compressed_document})
         return state
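
A hedged configuration sketch for the updated node (not part of the commit). The import path, the input/output keys, the state shape, and the proxy values are assumptions; the point is that anything placed under node_config["loader_kwargs"] now reaches ChromiumLoader unchanged, which in turn forwards unknown kwargs to chromium.launch().

from scrapegraphai.nodes import FetchNode  # assumed export path

# Hypothetical keys and proxy shape, for illustration only.
fetch_node = FetchNode(
    input="url",
    output=["doc"],
    node_config={
        "headless": True,
        "loader_kwargs": {"proxy": {"server": "http://127.0.0.1:8080"}},
    },
)
state = fetch_node.execute({"url": "https://example.com"})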
