Commit 94e69a0

feat: add scrape_do_integration
1 parent: 9e9c775

7 files changed (+85, -6 lines)


examples/extras/browser_base_integration.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,7 @@
 graph_config = {
     "llm": {
         "api_key": os.getenv("OPENAI_API_KEY"),
-        "model": "gpt-4o",
+        "model": "openai/gpt-4o",
     },
     "browser_base": {
         "api_key": os.getenv("BROWSER_BASE_API_KEY"),
```

examples/extras/scrape_do.py

Lines changed: 40 additions & 0 deletions
New file:

```python
"""
Basic example of scraping pipeline using SmartScraper
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************


graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "scrape_do": {
        "api_key": os.getenv("SCRAPE_DO_API_KEY"),
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))
```
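Note that the example reads both keys at module level after `load_dotenv()`, so they must either be exported in the shell or kept in a local `.env` file next to the script; a minimal `.env` sketch (both values are placeholders):

```
OPENAI_API_KEY=<your-openai-api-key>
SCRAPE_DO_API_KEY=<your-scrape-do-token>
```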

scrapegraphai/docloaders/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -2,3 +2,4 @@
 
 from .chromium import ChromiumLoader
 from .browser_base import browser_base_fetch
+from .scrape_do import scrape_do_fetch
```

scrapegraphai/docloaders/scrape_do.py

Lines changed: 23 additions & 0 deletions
New file:

```python
"""
scrape_do module
"""
import urllib.parse
import requests

def scrape_do_fetch(token, target_url):
    """
    This function takes a scrape.do API token and a URL as inputs.
    It fetches the target URL through the scrape.do proxy service and
    returns the response body.

    Args:
        token (str): The API token for the scrape.do service.
        target_url (str): A valid web page URL to fetch.

    Returns:
        str: The text of the scrape.do response for the target URL.
    """

    encoded_url = urllib.parse.quote(target_url)
    url = f"http://api.scrape.do?token={token}&url={encoded_url}"
    response = requests.request("GET", url)
    return response.text
```
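For reference, the loader can also be called directly through the export added to `scrapegraphai/docloaders/__init__.py` above; a minimal sketch, assuming a valid scrape.do token is set in the `SCRAPE_DO_API_KEY` environment variable:

```python
import os

from scrapegraphai.docloaders import scrape_do_fetch

# The helper URL-encodes the target, routes the request through the
# scrape.do API, and returns the raw response body as a string.
html = scrape_do_fetch(os.getenv("SCRAPE_DO_API_KEY"), "https://example.com/")
print(html[:200])
```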

scrapegraphai/graphs/abstract_graph.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -63,6 +63,7 @@ def __init__(self, prompt: str, config: dict,
         self.loader_kwargs = self.config.get("loader_kwargs", {})
         self.cache_path = self.config.get("cache_path", False)
         self.browser_base = self.config.get("browser_base")
+        self.scrape_do = self.config.get("scrape_do")
 
         self.graph = self._create_graph()
         self.final_state = None
```

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -67,7 +67,8 @@ def _create_graph(self) -> BaseGraph:
                 "force": self.config.get("force", False),
                 "cut": self.config.get("cut", True),
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
-                "browser_base": self.config.get("browser_base")
+                "browser_base": self.config.get("browser_base"),
+                "scrape_do": self.config.get("scrape_do")
             }
         )
         parse_node = ParseNode(
```

scrapegraphai/nodes/fetch_node.py

Lines changed: 17 additions & 4 deletions
```diff
@@ -76,6 +76,10 @@ def __init__(
             None if node_config is None else node_config.get("browser_base", None)
         )
 
+        self.scrape_do = (
+            None if node_config is None else node_config.get("scrape_do", None)
+        )
+
     def execute(self, state):
         """
         Executes the node's logic to fetch HTML content from a specified URL and
@@ -102,7 +106,7 @@ def execute(self, state):
 
         source = input_data[0]
         input_type = input_keys[0]
-
+
         handlers = {
             "json_dir": self.handle_directory,
             "xml_dir": self.handle_directory,
@@ -271,19 +275,28 @@ def handle_web_source(self, state, source):
             try:
                 from ..docloaders.browser_base import browser_base_fetch
             except ImportError:
-                raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.")
+                raise ImportError("""The browserbase module is not installed.
+                                  Please install it using `pip install browserbase`.""")
 
             data = browser_base_fetch(self.browser_base.get("api_key"),
                                       self.browser_base.get("project_id"), [source])
 
+            document = [Document(page_content=content,
+                                 metadata={"source": source}) for content in data]
+        elif self.scrape_do is not None:
+            from ..docloaders.scrape_do import scrape_do_fetch
+            data = scrape_do_fetch(self.scrape_do.get("api_key"),
+                                   source)
+
             document = [Document(page_content=content,
                                  metadata={"source": source}) for content in data]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()
 
         if not document or not document[0].page_content.strip():
-            raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+            raise ValueError("""No HTML body content found in
+                             the document fetched by ChromiumLoader.""")
         parsed_content = document[0].page_content
 
         if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
@@ -292,7 +305,7 @@ def handle_web_source(self, state, source):
             compressed_document = [
                 Document(page_content=parsed_content, metadata={"source": "html file"})
             ]
-
+
         return self.update_state(state, compressed_document)
 
     def update_state(self, state, compressed_document):
```
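Taken together, `handle_web_source` now prefers the remote fetch integrations over the local browser. A simplified sketch of the selection order this commit introduces (not the actual method body):

```python
# Simplified view of the dispatch in handle_web_source: browser_base
# takes precedence, then scrape_do, then the ChromiumLoader fallback.
def pick_fetcher(browser_base, scrape_do):
    if browser_base is not None:
        return "browser_base_fetch"   # remote browser session via Browserbase
    if scrape_do is not None:
        return "scrape_do_fetch"      # HTTP fetch through the scrape.do proxy
    return "ChromiumLoader"           # default local headless browser
```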
