Commit 9016bb5

Merge pull request #639 from ScrapeGraphAI/scrape_do_integration
Scrape do integration
2 parents af28885 + 167f970 commit 9016bb5

7 files changed: +109 −6 lines changed
examples/extras/browser_base_integration.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 graph_config = {
     "llm": {
         "api_key": os.getenv("OPENAI_API_KEY"),
-        "model": "gpt-4o",
+        "model": "openai/gpt-4o",
     },
     "browser_base": {
         "api_key": os.getenv("BROWSER_BASE_API_KEY"),

examples/extras/scrape_do.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("OPENAI_API_KEY"),
+        "model": "openai/gpt-4o",
+    },
+    "scrape_do": {
+        "api_key": os.getenv("SCRAPE_DO_API_KEY"),
+    },
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me all the projects",
+    source="https://perinim.github.io/projects/",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
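
The example pulls both keys from the environment via load_dotenv(), so OPENAI_API_KEY and SCRAPE_DO_API_KEY must be set (e.g. in a local .env file). A minimal pre-run sanity check, as a sketch:

    import os
    from dotenv import load_dotenv

    load_dotenv()
    missing = [k for k in ("OPENAI_API_KEY", "SCRAPE_DO_API_KEY") if not os.getenv(k)]
    if missing:
        raise SystemExit(f"Set these variables before running the example: {missing}")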

scrapegraphai/docloaders/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 
 from .chromium import ChromiumLoader
 from .browser_base import browser_base_fetch
+from .scrape_do import scrape_do_fetch

scrapegraphai/docloaders/scrape_do.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+"""
+Scrape_do module
+"""
+import urllib.parse
+import requests
+import urllib3
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+def scrape_do_fetch(token, target_url, use_proxy=False, geoCode=None, super_proxy=False):
+    """
+    Fetches the raw content of a target URL using the Scrape.do service.
+
+    Args:
+        token (str): The API token for the Scrape.do service.
+        target_url (str): A valid web page URL to fetch.
+        use_proxy (bool): Whether to use Scrape.do proxy mode. Default is False.
+        geoCode (str, optional): Specify the country code for
+        geolocation-based proxies. Default is None.
+        super_proxy (bool): If True, use Residential & Mobile Proxy Networks. Default is False.
+
+    Returns:
+        str: The raw response from the target URL.
+    """
+    encoded_url = urllib.parse.quote(target_url)
+    if use_proxy:
+        # Create proxy mode URL
+        proxyModeUrl = f"http://{token}:@proxy.scrape.do:8080"
+        proxies = {
+            "http": proxyModeUrl,
+            "https": proxyModeUrl,
+        }
+        # Add optional geoCode and super proxy parameters if provided
+        params = {"geoCode": geoCode, "super": str(super_proxy).lower()} if geoCode else {}
+        response = requests.get(target_url, proxies=proxies, verify=False, params=params)
+    else:
+        # API Mode URL
+        url = f"http://api.scrape.do?token={token}&url={encoded_url}"
+        response = requests.get(url)
+
+    return response.text
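
For a quick check outside the graph, the new loader can also be called directly. A minimal sketch, assuming scrapegraphai is installed and SCRAPE_DO_API_KEY holds a valid Scrape.do token; the target URL and the "us" geoCode are only illustrative:

    import os
    from dotenv import load_dotenv
    from scrapegraphai.docloaders import scrape_do_fetch

    load_dotenv()
    token = os.getenv("SCRAPE_DO_API_KEY")

    # API mode: the target URL is encoded and sent to api.scrape.do
    html = scrape_do_fetch(token, "https://perinim.github.io/projects/")

    # Proxy mode: requests are routed through proxy.scrape.do;
    # passing geoCode also enables the extra query parameters
    html_via_proxy = scrape_do_fetch(token, "https://perinim.github.io/projects/",
                                     use_proxy=True, geoCode="us", super_proxy=False)

    print(len(html), len(html_via_proxy))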

scrapegraphai/graphs/abstract_graph.py

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ def __init__(self, prompt: str, config: dict,
         self.loader_kwargs = self.config.get("loader_kwargs", {})
         self.cache_path = self.config.get("cache_path", False)
         self.browser_base = self.config.get("browser_base")
+        self.scrape_do = self.config.get("scrape_do")
 
         self.graph = self._create_graph()
         self.final_state = None

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -67,7 +67,8 @@ def _create_graph(self) -> BaseGraph:
                 "force": self.config.get("force", False),
                 "cut": self.config.get("cut", True),
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
-                "browser_base": self.config.get("browser_base")
+                "browser_base": self.config.get("browser_base"),
+                "scrape_do": self.config.get("scrape_do")
             }
         )
         parse_node = ParseNode(

scrapegraphai/nodes/fetch_node.py

Lines changed: 23 additions & 4 deletions
@@ -76,6 +76,10 @@ def __init__(
             None if node_config is None else node_config.get("browser_base", None)
         )
 
+        self.scrape_do = (
+            None if node_config is None else node_config.get("scrape_do", None)
+        )
+
     def execute(self, state):
        """
        Executes the node's logic to fetch HTML content from a specified URL and
@@ -102,7 +106,7 @@ def execute(self, state):
 
        source = input_data[0]
        input_type = input_keys[0]
-
+
        handlers = {
            "json_dir": self.handle_directory,
            "xml_dir": self.handle_directory,
@@ -271,19 +275,34 @@ def handle_web_source(self, state, source):
            try:
                from ..docloaders.browser_base import browser_base_fetch
            except ImportError:
-                raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.")
+                raise ImportError("""The browserbase module is not installed.
+                                  Please install it using `pip install browserbase`.""")
 
            data = browser_base_fetch(self.browser_base.get("api_key"),
                                      self.browser_base.get("project_id"), [source])
 
            document = [Document(page_content=content,
                                 metadata={"source": source}) for content in data]
+        elif self.scrape_do is not None:
+            from ..docloaders.scrape_do import scrape_do_fetch
+            if self.scrape_do.get("use_proxy") is None or self.scrape_do.get("geoCode") is None or self.scrape_do.get("super_proxy") is None:
+                data = scrape_do_fetch(self.scrape_do.get("api_key"),
+                                       source)
+            else:
+                data = scrape_do_fetch(self.scrape_do.get("api_key"),
+                                       source, self.scrape_do.get("use_proxy"),
+                                       self.scrape_do.get("geoCode"),
+                                       self.scrape_do.get("super_proxy"))
+
+            document = [Document(page_content=data,
+                                 metadata={"source": source})]
        else:
            loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
            document = loader.load()
 
        if not document or not document[0].page_content.strip():
-            raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+            raise ValueError("""No HTML body content found in
+                             the document fetched by ChromiumLoader.""")
        parsed_content = document[0].page_content
 
        if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
@@ -292,7 +311,7 @@ def handle_web_source(self, state, source):
            compressed_document = [
                Document(page_content=parsed_content, metadata={"source": "html file"})
            ]
-
+
        return self.update_state(state, compressed_document)
 
    def update_state(self, state, compressed_document):
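
One behavioural note on the new branch in handle_web_source: FetchNode switches to Scrape.do as soon as a "scrape_do" dict is present in the node config, but it only forwards the proxy options when use_proxy, geoCode and super_proxy are all provided; if any of them is missing it falls back to plain API mode with the loader's defaults. A hedged sketch of a graph config that would exercise the proxy path (the values are illustrative, not part of this commit):

    import os

    graph_config = {
        "llm": {
            "api_key": os.getenv("OPENAI_API_KEY"),
            "model": "openai/gpt-4o",
        },
        "scrape_do": {
            "api_key": os.getenv("SCRAPE_DO_API_KEY"),
            "use_proxy": True,
            "geoCode": "us",       # illustrative country code
            "super_proxy": False,
        },
    }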
