Skip to content

Commit 930adb3

Browse files
committed
feat(node): multiple url search in SearchGraph + fixes
Implemented GraphIteratorNode and MergeAnswersNode to create multiple instances of a graph and merge the scraped content from multiple pages
1 parent dbb614a commit 930adb3

File tree

5 files changed

+50
-52
lines changed

5 files changed

+50
-52
lines changed

examples/openai/search_graph_multi.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
output=["urls"],
4646
node_config={
4747
"llm_model": llm_model,
48+
"max_results": 5, # num of search results to fetch
4849
"verbose": True,
4950
}
5051
)

examples/openai/search_graph_openai.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,16 @@
1919
"api_key": openai_key,
2020
"model": "gpt-3.5-turbo",
2121
},
22+
"max_results": 5,
23+
"verbose": True,
2224
}
2325

2426
# ************************************************
2527
# Create the SearchGraph instance and run it
2628
# ************************************************
2729

2830
search_graph = SearchGraph(
29-
prompt="List me top 5 eyeliner products for a gift.",
31+
prompt="List me the best excursions near Trento",
3032
config=graph_config
3133
)
3234

scrapegraphai/graphs/search_graph.py

Lines changed: 37 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@
55
from .base_graph import BaseGraph
66
from ..nodes import (
77
SearchInternetNode,
8-
FetchNode,
9-
ParseNode,
10-
RAGNode,
11-
GenerateAnswerNode
8+
GraphIteratorNode,
9+
MergeAnswersNode
1210
)
1311
from .abstract_graph import AbstractGraph
12+
from .smart_scraper_graph import SmartScraperGraph
1413

1514

1615
class SearchGraph(AbstractGraph):
@@ -38,6 +37,11 @@ class SearchGraph(AbstractGraph):
3837
>>> result = search_graph.run()
3938
"""
4039

40+
def __init__(self, prompt: str, config: dict):
41+
42+
self.max_results = config.get("max_results", 3)
43+
super().__init__(prompt, config)
44+
4145
def _create_graph(self) -> BaseGraph:
4246
"""
4347
Creates the graph of nodes representing the workflow for web scraping and searching.
@@ -46,53 +50,53 @@ def _create_graph(self) -> BaseGraph:
4650
BaseGraph: A graph instance representing the web scraping and searching workflow.
4751
"""
4852

53+
# ************************************************
54+
# Create a SmartScraperGraph instance
55+
# ************************************************
56+
57+
smart_scraper_instance = SmartScraperGraph(
58+
prompt="",
59+
source="",
60+
config=self.config
61+
)
62+
63+
# ************************************************
64+
# Define the graph nodes
65+
# ************************************************
66+
4967
search_internet_node = SearchInternetNode(
5068
input="user_prompt",
51-
output=["url"],
52-
node_config={
53-
"llm_model": self.llm_model
54-
}
55-
)
56-
fetch_node = FetchNode(
57-
input="url | local_dir",
58-
output=["doc"]
59-
)
60-
parse_node = ParseNode(
61-
input="doc",
62-
output=["parsed_doc"],
69+
output=["urls"],
6370
node_config={
64-
"chunk_size": self.model_token
71+
"llm_model": self.llm_model,
72+
"max_results": self.max_results
6573
}
6674
)
67-
rag_node = RAGNode(
68-
input="user_prompt & (parsed_doc | doc)",
69-
output=["relevant_chunks"],
75+
graph_iterator_node = GraphIteratorNode(
76+
input="user_prompt & urls",
77+
output=["results"],
7078
node_config={
71-
"llm_model": self.llm_model,
72-
"embedder_model": self.embedder_model
79+
"graph_instance": smart_scraper_instance,
7380
}
7481
)
75-
generate_answer_node = GenerateAnswerNode(
76-
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
82+
83+
merge_answers_node = MergeAnswersNode(
84+
input="user_prompt & results",
7785
output=["answer"],
7886
node_config={
79-
"llm_model": self.llm_model
87+
"llm_model": self.llm_model,
8088
}
8189
)
8290

8391
return BaseGraph(
8492
nodes=[
8593
search_internet_node,
86-
fetch_node,
87-
parse_node,
88-
rag_node,
89-
generate_answer_node,
94+
graph_iterator_node,
95+
merge_answers_node
9096
],
9197
edges=[
92-
(search_internet_node, fetch_node),
93-
(fetch_node, parse_node),
94-
(parse_node, rag_node),
95-
(rag_node, generate_answer_node)
98+
(search_internet_node, graph_iterator_node),
99+
(graph_iterator_node, merge_answers_node)
96100
],
97101
entry_point=search_internet_node
98102
)

scrapegraphai/nodes/graph_iterator_node.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,8 @@
1010

1111
class GraphIteratorNode(BaseNode):
1212
"""
13-
A node responsible for parsing HTML content from a document.
14-
The parsed content is split into chunks for further processing.
15-
16-
This node enhances the scraping workflow by allowing for targeted extraction of
17-
content, thereby optimizing the processing of large HTML documents.
13+
A node responsible for instantiating and running multiple graph instances in parallel.
14+
It creates as many graph instances as the number of elements in the input list.
1815
1916
Attributes:
2017
verbose (bool): A flag indicating whether to show print statements during execution.
@@ -33,18 +30,18 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict]=No
3330

3431
def execute(self, state: dict) -> dict:
3532
"""
36-
Executes the node's logic to parse the HTML document content and split it into chunks.
33+
Executes the node's logic to instantiate and run multiple graph instances in parallel.
3734
3835
Args:
39-
state (dict): The current state of the graph. The input keys will be used to fetch the
40-
correct data from the state.
36+
state (dict): The current state of the graph. The input keys will be used to fetch
37+
the correct data from the state.
4138
4239
Returns:
43-
dict: The updated state with the output key containing the parsed content chunks.
40+
dict: The updated state with the output key containing the results of the graph instances.
4441
4542
Raises:
4643
KeyError: If the input keys are not found in the state, indicating that the
47-
necessary information for parsing the content is missing.
44+
necessary information for running the graph instances is missing.
4845
"""
4946

5047
if self.verbose:
@@ -79,5 +76,4 @@ def execute(self, state: dict) -> dict:
7976
graphs_answers.append(result)
8077

8178
state.update({self.output[0]: graphs_answers})
82-
8379
return state

scrapegraphai/nodes/merge_answers_node.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,14 @@
99
# Imports from Langchain
1010
from langchain.prompts import PromptTemplate
1111
from langchain_core.output_parsers import JsonOutputParser
12-
from langchain_core.runnables import RunnableParallel
1312

1413
# Imports from the library
1514
from .base_node import BaseNode
1615

1716

1817
class MergeAnswersNode(BaseNode):
1918
"""
20-
A node that generates an answer using a large language model (LLM) based on the user's input
21-
and the content extracted from a webpage. It constructs a prompt from the user's input
22-
and the scraped content, feeds it to the LLM, and parses the LLM's response to produce
23-
an answer.
19+
A node responsible for merging the answers from multiple graph instances into a single answer.
2420
2521
Attributes:
2622
llm_model: An instance of a language model client, configured for generating answers.
@@ -42,8 +38,7 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict]=No
4238

4339
def execute(self, state: dict) -> dict:
4440
"""
45-
Generates an answer by constructing a prompt from the user's input and the scraped
46-
content, querying the language model, and parsing its response.
41+
Executes the node's logic to merge the answers from multiple graph instances into a single answer.
4742
4843
Args:
4944
state (dict): The current state of the graph. The input keys will be used

0 commit comments

Comments
 (0)