
Browserbase integration #499


Merged
merged 11 commits on Aug 1, 2024
49 changes: 49 additions & 0 deletions examples/extras/browser_base_integration.py
@@ -0,0 +1,49 @@
"""
Basic example of scraping pipeline using SmartScraper
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************


graph_config = {
"llm": {
"api_key": os.getenv("OPENAI_API_KEY"),
"model": "gpt-3.5-turbo",
},
"browser_base": {
"api_key": os.getenv("BROWSER_BASE_API_KEY"),
"project_id": os.getenv("BROWSER_BASE_API_KEY"),
},
"verbose": True,
"headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
prompt="List me what does the company do, the name and a contact email.",
source="https://scrapegraphai.com/",
config=graph_config
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
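
The example reads its credentials from a local `.env` file. A quick sanity check before running it might look like the following (a minimal sketch; `BROWSER_BASE_PROJECT_ID` is an assumed variable name, chosen to mirror the API-key variable above):

import os
from dotenv import load_dotenv

load_dotenv()

# Fail fast if any key the example script reads is missing from the environment.
for var in ("OPENAI_API_KEY", "BROWSER_BASE_API_KEY", "BROWSER_BASE_PROJECT_ID"):
    if not os.getenv(var):
        raise EnvironmentError(f"missing required environment variable: {var}")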
5 changes: 5 additions & 0 deletions requirements-dev.lock
@@ -54,6 +54,8 @@ boto3==1.34.146
botocore==1.34.146
# via boto3
# via s3transfer
browserbase==0.3.0
# via scrapegraphai
burr==0.22.1
# via scrapegraphai
cachetools==5.4.0
@@ -208,6 +210,7 @@ httptools==0.6.1
# via uvicorn
httpx==0.27.0
# via anthropic
# via browserbase
# via fastapi
# via fireworks-ai
# via groq
@@ -383,6 +386,7 @@ pillow==10.4.0
platformdirs==4.2.2
# via pylint
playwright==1.45.0
# via browserbase
# via scrapegraphai
# via undetected-playwright
pluggy==1.5.0
@@ -412,6 +416,7 @@ pyasn1-modules==0.4.0
# via google-auth
pydantic==2.8.2
# via anthropic
# via browserbase
# via burr
# via fastapi
# via fastapi-pagination
5 changes: 5 additions & 0 deletions requirements.lock
@@ -37,6 +37,8 @@ boto3==1.34.146
botocore==1.34.146
# via boto3
# via s3transfer
browserbase==0.3.0
# via scrapegraphai
cachetools==5.4.0
# via google-auth
certifi==2024.7.4
@@ -153,6 +155,7 @@ httplib2==0.22.0
# via google-auth-httplib2
httpx==0.27.0
# via anthropic
# via browserbase
# via fireworks-ai
# via groq
# via openai
@@ -275,6 +278,7 @@ pillow==10.4.0
# via langchain-nvidia-ai-endpoints
# via sentence-transformers
playwright==1.45.0
# via browserbase
# via scrapegraphai
# via undetected-playwright
proto-plus==1.24.0
@@ -299,6 +303,7 @@ pyasn1-modules==0.4.0
# via google-auth
pydantic==2.8.2
# via anthropic
# via browserbase
# via fireworks-ai
# via google-cloud-aiplatform
# via google-generativeai
2 changes: 1 addition & 1 deletion scrapegraphai/docloaders/__init__.py
@@ -1,4 +1,4 @@
"""__init__.py file for docloaders folder"""

from .chromium import ChromiumLoader
from .broswer_base import browser_base_fetch
from .browser_base import browser_base_fetch
scrapegraphai/docloaders/broswer_base.py → scrapegraphai/docloaders/browser_base.py
@@ -1,9 +1,10 @@
"""
browserbase integration module
"""
from typing import List
from browserbase import Browserbase

def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]:
"""
BrowserBase Fetch

@@ -15,7 +16,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
- `link`: The URL or list of URLs to fetch data from.

It initializes a Browserbase object with the given API key and project ID,
then uses this object to load the specified link. It returns the result of the loading operation.
then uses this object to load the specified link.
It returns the result of the loading operation.

Example usage:

@@ -41,6 +43,6 @@

browserbase = Browserbase(api_key=api_key, project_id=project_id)

result = browserbase.load(link)
result = browserbase.load([link])

return result
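
The docstring's "Example usage" section is collapsed in this view. A minimal sketch of calling the helper directly, following the annotated signature above (the environment-variable names are assumptions):

import os
from scrapegraphai.docloaders.browser_base import browser_base_fetch

pages = browser_base_fetch(
    os.getenv("BROWSER_BASE_API_KEY"),
    os.getenv("BROWSER_BASE_PROJECT_ID"),
    ["https://scrapegraphai.com/"],
)
# The helper is annotated to return a list of page contents, one per URL.
for html in pages:
    print(html[:200])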
3 changes: 3 additions & 0 deletions scrapegraphai/docloaders/chromium.py
@@ -1,3 +1,6 @@
"""
Chromium module
"""
import asyncio
from typing import Any, AsyncIterator, Iterator, List, Optional

7 changes: 4 additions & 3 deletions scrapegraphai/graphs/abstract_graph.py
@@ -59,10 +59,11 @@ def __init__(self, prompt: str, config: dict,
self.llm_model = self._create_llm(config["llm"])
self.verbose = False if config is None else config.get(
"verbose", False)
self.headless = True if config is None else config.get(
self.headless = True if self.config is None else config.get(
"headless", True)
self.loader_kwargs = config.get("loader_kwargs", {})
self.cache_path = config.get("cache_path", False)
self.loader_kwargs = self.config.get("loader_kwargs", {})
self.cache_path = self.config.get("cache_path", False)
self.browser_base = self.config.get("browser_base")

# Create the graph
self.graph = self._create_graph()
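
For context, the `browser_base` entry read here is the optional block from the graph config shown in the example script; a minimal sketch of the shape `AbstractGraph` now consumes (placeholder values):

graph_config = {
    "llm": {"api_key": "<openai-api-key>", "model": "gpt-3.5-turbo"},
    # Optional: when this key is present, fetching is routed through Browserbase.
    "browser_base": {
        "api_key": "<browserbase-api-key>",
        "project_id": "<browserbase-project-id>",
    },
}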
17 changes: 13 additions & 4 deletions scrapegraphai/nodes/fetch_node.py
@@ -11,6 +11,7 @@
from langchain_core.documents import Document
from ..utils.cleanup_html import cleanup_html
from ..docloaders import ChromiumLoader
from ..docloaders.browser_base import browser_base_fetch
from ..utils.convert_to_md import convert_to_md
from ..utils.logging import get_logger
from .base_node import BaseNode
@@ -74,6 +75,8 @@ def __init__(
False if node_config is None else node_config.get("cut", True)
)

self.browser_base = node_config.get("browser_base")

def execute(self, state):
"""
Executes the node's logic to fetch HTML content from a specified URL and
@@ -164,7 +167,7 @@ def execute(self, state):

parsed_content = source

if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
parsed_content = convert_to_md(source)

compressed_document = [
@@ -177,7 +180,7 @@ def execute(self, state):
if response.status_code == 200:
if not response.text.strip():
raise ValueError("No HTML body content found in the response.")

parsed_content = response

if not self.cut:
@@ -198,8 +201,14 @@ def execute(self, state):
if self.node_config is not None:
loader_kwargs = self.node_config.get("loader_kwargs", {})

loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
document = loader.load()
if self.browser_base is not None:
data = browser_base_fetch(self.browser_base.get("api_key"),
self.browser_base.get("project_id"), [source])

document = [Document(page_content=content, metadata={"source": source}) for content in data]
else:
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
document = loader.load()

if not document or not document[0].page_content.strip():
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
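
Reduced to its essentials, the new dispatch in FetchNode amounts to the following (a simplified sketch of the branch above, not the node's full logic):

from typing import Optional

from langchain_core.documents import Document

from scrapegraphai.docloaders import ChromiumLoader
from scrapegraphai.docloaders.browser_base import browser_base_fetch

def fetch_documents(source: str, browser_base: Optional[dict],
                    headless: bool = True) -> list:
    # Prefer Browserbase when it is configured; otherwise fall back to the
    # Playwright-based ChromiumLoader.
    if browser_base is not None:
        data = browser_base_fetch(browser_base.get("api_key"),
                                  browser_base.get("project_id"), [source])
        return [Document(page_content=content, metadata={"source": source})
                for content in data]
    return ChromiumLoader([source], headless=headless).load()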