Skip to content

Commit 1d7f30b

Browse files
committed
fix: browser-base integration
1 parent a6fcc1e commit 1d7f30b

File tree

5 files changed

+23
-20
lines changed

5 files changed

+23
-20
lines changed

examples/extras/browser_base_integration.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
graph_config = {
1919
"llm": {
2020
"api_key": os.getenv("OPENAI_API_KEY"),
21-
"model": "gpt-3.5-turbo",
21+
"model": "gpt-4o",
2222
},
2323
"browser_base": {
2424
"api_key": os.getenv("BROWSER_BASE_API_KEY"),

scrapegraphai/docloaders/browser_base.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s
4343

4444
browserbase = Browserbase(api_key=api_key, project_id=project_id)
4545

46-
result = browserbase.load([link])
46+
result = []
47+
for l in link:
48+
result.append(browserbase.load(l, text_content=True))
4749

4850
return result

scrapegraphai/graphs/abstract_graph.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -146,10 +146,10 @@ def handle_model(model_name, provider, token_key, default_token=8192):
146146
with warnings.catch_warnings():
147147
warnings.simplefilter("ignore")
148148
return init_chat_model(**llm_params)
149-
150-
known_models = ["openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]
151149

152-
if llm_params["model"].split("/")[0] not in known_models:
150+
known_models = ["gpt","openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]
151+
152+
if llm_params["model"].split("/")[0] not in known_models and llm_params["model"].split("-")[0] not in known_models:
153153
raise ValueError(f"Model '{llm_params['model']}' is not supported")
154154

155155
try:

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,7 @@ def _create_graph(self) -> BaseGraph:
6767
"force": self.config.get("force", False),
6868
"cut": self.config.get("cut", True),
6969
"loader_kwargs": self.config.get("loader_kwargs", {}),
70+
"browser_base": self.config.get("browser_base")
7071
}
7172
)
7273
parse_node = ParseNode(

scrapegraphai/nodes/fetch_node.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -121,7 +121,7 @@ def execute(self, state):
121121
"xml": self.handle_file,
122122
"md": self.handle_file,
123123
}
124-
124+
125125
if input_type in handlers:
126126
return handlers[input_type](state, input_type, source)
127127
elif self.input == "pdf_dir":
@@ -130,7 +130,7 @@ def execute(self, state):
130130
return self.handle_local_source(state, source)
131131
else:
132132
return self.handle_web_source(state, source)
133-
133+
134134
def handle_directory(self, state, input_type, source):
135135
"""
136136
Handles the directory by compressing the source document and updating the state.
@@ -143,7 +143,7 @@ def handle_directory(self, state, input_type, source):
143143
Returns:
144144
dict: The updated state with the compressed document.
145145
"""
146-
146+
147147
compressed_document = [
148148
source
149149
]
@@ -169,11 +169,11 @@ def handle_file(self, state, input_type, source):
169169
- "xml": Reads the content of an XML file as a string.
170170
- "md": Reads the content of a Markdown file as a string.
171171
"""
172-
172+
173173
compressed_document = self.load_file_content(source, input_type)
174-
174+
175175
return self.update_state(state, compressed_document)
176-
176+
177177
def load_file_content(self, source, input_type):
178178
"""
179179
Loads the content of a file based on its input type.
@@ -185,7 +185,7 @@ def load_file_content(self, source, input_type):
185185
Returns:
186186
list: A list containing a Document object with the loaded content and metadata.
187187
"""
188-
188+
189189
if input_type == "pdf":
190190
loader = PyPDFLoader(source)
191191
return loader.load()
@@ -198,7 +198,7 @@ def load_file_content(self, source, input_type):
198198
with open(source, "r", encoding="utf-8") as f:
199199
data = f.read()
200200
return [Document(page_content=data, metadata={"source": input_type})]
201-
201+
202202
def handle_local_source(self, state, source):
203203
"""
204204
Handles the local source by fetching HTML content, optionally converting it to Markdown,
@@ -214,11 +214,11 @@ def handle_local_source(self, state, source):
214214
Raises:
215215
ValueError: If the source is empty or contains only whitespace.
216216
"""
217-
217+
218218
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
219219
if not source.strip():
220220
raise ValueError("No HTML body content found in the local source.")
221-
221+
222222
parsed_content = source
223223

224224
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
@@ -229,13 +229,13 @@ def handle_local_source(self, state, source):
229229
compressed_document = [
230230
Document(page_content=parsed_content, metadata={"source": "local_dir"})
231231
]
232-
232+
233233
return self.update_state(state, compressed_document)
234-
234+
235235
def handle_web_source(self, state, source):
236236
"""
237-
Handles the web source by fetching HTML content from a URL, optionally converting it to Markdown,
238-
and updating the state.
237+
Handles the web source by fetching HTML content from a URL,
238+
optionally converting it to Markdown, and updating the state.
239239
240240
Parameters:
241241
state (dict): The current state of the graph.
@@ -247,7 +247,7 @@ def handle_web_source(self, state, source):
247247
Raises:
248248
ValueError: If the fetched HTML content is empty or contains only whitespace.
249249
"""
250-
250+
251251
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
252252
if self.use_soup:
253253
response = requests.get(source)

0 commit comments

Comments (0)