Skip to content

Commit 5ecdbe7

Browse files
committed
feat: add integration in the abstract graph
1 parent 7076ab1 commit 5ecdbe7

File tree

4 files changed

+27
-14
lines changed

4 files changed

+27
-14
lines changed

examples/extras/browser_base.py renamed to examples/extras/browser_base_integration.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
Basic example of scraping pipeline using SmartScraper
33
"""
44

5-
import os, json
5+
import os
6+
import json
7+
from dotenv import load_dotenv
68
from scrapegraphai.graphs import SmartScraperGraph
79
from scrapegraphai.utils import prettify_exec_info
8-
from dotenv import load_dotenv
10+
911
load_dotenv()
1012

1113
# ************************************************

scrapegraphai/docloaders/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""__init__.py file for docloaders folder"""
22

33
from .chromium import ChromiumLoader
4-
from .broswer_base import browser_base_fetch
4+
from .browser_base import browser_base_fetch

scrapegraphai/graphs/abstract_graph.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,16 @@ def __init__(self, prompt: str, config: dict,
7272
self.source = source
7373
self.config = config
7474
self.schema = schema
75-
self.llm_model = self._create_llm(config["llm"], chat=True)
76-
self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder(
77-
config["embeddings"])
78-
self.verbose = False if config is None else config.get(
75+
self.llm_model = self._create_llm(self.config["llm"], chat=True)
76+
self.embedder_model = self._create_default_embedder(llm_config=self.config["llm"]) if "embeddings" not in self.config else self._create_embedder(
77+
self.config["embeddings"])
78+
self.verbose = False if self.config is None else self.config.get(
7979
"verbose", False)
80-
self.headless = True if config is None else config.get(
80+
self.headless = True if self.config is None else config.get(
8181
"headless", True)
82-
self.loader_kwargs = config.get("loader_kwargs", {})
83-
self.cache_path = config.get("cache_path", False)
82+
self.loader_kwargs = self.config.get("loader_kwargs", {})
83+
self.cache_path = self.config.get("cache_path", False)
84+
self.browser_base = self.config.get("browser_base")
8485

8586
# Create the graph
8687
self.graph = self._create_graph()

scrapegraphai/nodes/fetch_node.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from langchain_core.documents import Document
1212
from ..utils.cleanup_html import cleanup_html
1313
from ..docloaders import ChromiumLoader
14+
from ..docloaders.browser_base import browser_base_fetch
1415
from ..utils.convert_to_md import convert_to_md
1516
from ..utils.logging import get_logger
1617
from .base_node import BaseNode
@@ -74,6 +75,8 @@ def __init__(
7475
False if node_config is None else node_config.get("cut", True)
7576
)
7677

78+
self.browser_base = node_config.get("browser_base")
79+
7780
def execute(self, state):
7881
"""
7982
Executes the node's logic to fetch HTML content from a specified URL and
@@ -164,7 +167,7 @@ def execute(self, state):
164167

165168
parsed_content = source
166169

167-
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
170+
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
168171
parsed_content = convert_to_md(source)
169172

170173
compressed_document = [
@@ -177,7 +180,7 @@ def execute(self, state):
177180
if response.status_code == 200:
178181
if not response.text.strip():
179182
raise ValueError("No HTML body content found in the response.")
180-
183+
181184
parsed_content = response
182185

183186
if not self.cut:
@@ -198,8 +201,15 @@ def execute(self, state):
198201
if self.node_config is not None:
199202
loader_kwargs = self.node_config.get("loader_kwargs", {})
200203

201-
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
202-
document = loader.load()
204+
if self.browser_base is not None:
205+
document = [
206+
Document(page_content= browser_base_fetch(self.browser_base.get("api_key"),
207+
self.browser_base.get("project_id"), source),
208+
metadata={})
209+
]
210+
else:
211+
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
212+
document = loader.load()
203213

204214
if not document or not document[0].page_content.strip():
205215
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")

0 commit comments

Comments
 (0)