Skip to content

Commit 78343e5

Browse files
authored
Merge pull request #499 from ScrapeGraphAI/browserbase_integration
Browserbase integration
2 parents 55f706f + fe099f9 commit 78343e5

File tree

8 files changed

+85
-11
lines changed

8 files changed

+85
-11
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
"""
Basic example of scraping pipeline using SmartScraper with Browserbase.
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Load OPENAI_API_KEY / BROWSER_BASE_* credentials from a local .env file.
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************


graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "gpt-3.5-turbo",
    },
    "browser_base": {
        "api_key": os.getenv("BROWSER_BASE_API_KEY"),
        # Fix: the project id must come from its own environment variable;
        # the original read BROWSER_BASE_API_KEY here (copy-paste error),
        # which made the Browserbase loader authenticate with a bogus project.
        "project_id": os.getenv("BROWSER_BASE_PROJECT_ID"),
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

requirements-dev.lock

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ boto3==1.34.146
5454
botocore==1.34.146
5555
# via boto3
5656
# via s3transfer
57+
browserbase==0.3.0
58+
# via scrapegraphai
5759
burr==0.22.1
5860
# via scrapegraphai
5961
cachetools==5.4.0
@@ -208,6 +210,7 @@ httptools==0.6.1
208210
# via uvicorn
209211
httpx==0.27.0
210212
# via anthropic
213+
# via browserbase
211214
# via fastapi
212215
# via fireworks-ai
213216
# via groq
@@ -383,6 +386,7 @@ pillow==10.4.0
383386
platformdirs==4.2.2
384387
# via pylint
385388
playwright==1.45.0
389+
# via browserbase
386390
# via scrapegraphai
387391
# via undetected-playwright
388392
pluggy==1.5.0
@@ -412,6 +416,7 @@ pyasn1-modules==0.4.0
412416
# via google-auth
413417
pydantic==2.8.2
414418
# via anthropic
419+
# via browserbase
415420
# via burr
416421
# via fastapi
417422
# via fastapi-pagination

requirements.lock

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ boto3==1.34.146
3737
botocore==1.34.146
3838
# via boto3
3939
# via s3transfer
40+
browserbase==0.3.0
41+
# via scrapegraphai
4042
cachetools==5.4.0
4143
# via google-auth
4244
certifi==2024.7.4
@@ -153,6 +155,7 @@ httplib2==0.22.0
153155
# via google-auth-httplib2
154156
httpx==0.27.0
155157
# via anthropic
158+
# via browserbase
156159
# via fireworks-ai
157160
# via groq
158161
# via openai
@@ -275,6 +278,7 @@ pillow==10.4.0
275278
# via langchain-nvidia-ai-endpoints
276279
# via sentence-transformers
277280
playwright==1.45.0
281+
# via browserbase
278282
# via scrapegraphai
279283
# via undetected-playwright
280284
proto-plus==1.24.0
@@ -299,6 +303,7 @@ pyasn1-modules==0.4.0
299303
# via google-auth
300304
pydantic==2.8.2
301305
# via anthropic
306+
# via browserbase
302307
# via fireworks-ai
303308
# via google-cloud-aiplatform
304309
# via google-generativeai

scrapegraphai/docloaders/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""__init__.py file for docloaders folder"""
22

33
from .chromium import ChromiumLoader
4-
from .broswer_base import browser_base_fetch
4+
from .browser_base import browser_base_fetch

scrapegraphai/docloaders/broswer_base.py renamed to scrapegraphai/docloaders/browser_base.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
"""
22
browserbase integration module
33
"""
4+
from typing import List
45
from browserbase import Browserbase
56

6-
def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
7+
def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]:
78
"""
89
BrowserBase Fetch
910
@@ -15,7 +16,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
1516
- `link`: The URL or link that you want to fetch data from.
1617
1718
It initializes a Browserbase object with the given API key and project ID,
18-
then uses this object to load the specified link. It returns the result of the loading operation.
19+
then uses this object to load the specified link.
20+
It returns the result of the loading operation.
1921
2022
Example usage:
2123
@@ -41,6 +43,6 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
4143

4244
browserbase = Browserbase(api_key=api_key, project_id=project_id)
4345

44-
result = browserbase.load(link)
46+
result = browserbase.load([link])
4547

4648
return result

scrapegraphai/docloaders/chromium.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
"""
2+
Chromium module
3+
"""
14
import asyncio
25
from typing import Any, AsyncIterator, Iterator, List, Optional
36

scrapegraphai/graphs/abstract_graph.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,11 @@ def __init__(self, prompt: str, config: dict,
5959
self.llm_model = self._create_llm(config["llm"])
6060
self.verbose = False if config is None else config.get(
6161
"verbose", False)
62-
self.headless = True if config is None else config.get(
62+
self.headless = True if self.config is None else config.get(
6363
"headless", True)
64-
self.loader_kwargs = config.get("loader_kwargs", {})
65-
self.cache_path = config.get("cache_path", False)
64+
self.loader_kwargs = self.config.get("loader_kwargs", {})
65+
self.cache_path = self.config.get("cache_path", False)
66+
self.browser_base = self.config.get("browser_base")
6667

6768
# Create the graph
6869
self.graph = self._create_graph()

scrapegraphai/nodes/fetch_node.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from langchain_core.documents import Document
1212
from ..utils.cleanup_html import cleanup_html
1313
from ..docloaders import ChromiumLoader
14+
from ..docloaders.browser_base import browser_base_fetch
1415
from ..utils.convert_to_md import convert_to_md
1516
from ..utils.logging import get_logger
1617
from .base_node import BaseNode
@@ -74,6 +75,8 @@ def __init__(
7475
False if node_config is None else node_config.get("cut", True)
7576
)
7677

78+
self.browser_base = node_config.get("browser_base")
79+
7780
def execute(self, state):
7881
"""
7982
Executes the node's logic to fetch HTML content from a specified URL and
@@ -164,7 +167,7 @@ def execute(self, state):
164167

165168
parsed_content = source
166169

167-
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
170+
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
168171
parsed_content = convert_to_md(source)
169172

170173
compressed_document = [
@@ -177,7 +180,7 @@ def execute(self, state):
177180
if response.status_code == 200:
178181
if not response.text.strip():
179182
raise ValueError("No HTML body content found in the response.")
180-
183+
181184
parsed_content = response
182185

183186
if not self.cut:
@@ -198,8 +201,14 @@ def execute(self, state):
198201
if self.node_config is not None:
199202
loader_kwargs = self.node_config.get("loader_kwargs", {})
200203

201-
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
202-
document = loader.load()
204+
if self.browser_base is not None:
205+
data = browser_base_fetch(self.browser_base.get("api_key"),
206+
self.browser_base.get("project_id"), [source])
207+
208+
document = [Document(page_content=content, metadata={"source": source}) for content in data]
209+
else:
210+
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
211+
document = loader.load()
203212

204213
if not document or not document[0].page_content.strip():
205214
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")

0 commit comments

Comments
 (0)