Skip to content

Commit dbb614a

Browse files
committed
feat: multiple graph instances
1 parent 1c4ba91 commit dbb614a

16 files changed

+245
-60
lines changed

scrapegraphai/nodes/graphs_iterator_node.py renamed to examples/openai/search_graph_multi.py

Lines changed: 26 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
from dotenv import load_dotenv
77
from langchain_openai import OpenAIEmbeddings
88
from scrapegraphai.models import OpenAI
9-
from scrapegraphai.graphs import BaseGraph
10-
from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, SearchInternetNode
9+
from scrapegraphai.graphs import BaseGraph, SmartScraperGraph
10+
from scrapegraphai.nodes import SearchInternetNode, GraphIteratorNode, MergeAnswersNode
1111
load_dotenv()
1212

1313
# ************************************************
@@ -23,6 +23,16 @@
2323
},
2424
}
2525

26+
# ************************************************
27+
# Create a SmartScraperGraph instance
28+
# ************************************************
29+
30+
smart_scraper_graph = SmartScraperGraph(
31+
prompt="",
32+
source="",
33+
config=graph_config
34+
)
35+
2636
# ************************************************
2737
# Define the graph nodes
2838
# ************************************************
@@ -32,38 +42,24 @@
3242

3343
search_internet_node = SearchInternetNode(
3444
input="user_prompt",
35-
output=["url"],
36-
node_config={
37-
"llm_model": llm_model
38-
}
39-
)
40-
fetch_node = FetchNode(
41-
input="url | local_dir",
42-
output=["doc"],
45+
output=["urls"],
4346
node_config={
44-
"verbose": True,
45-
"headless": True,
46-
}
47-
)
48-
parse_node = ParseNode(
49-
input="doc",
50-
output=["parsed_doc"],
51-
node_config={
52-
"chunk_size": 4096,
47+
"llm_model": llm_model,
5348
"verbose": True,
5449
}
5550
)
56-
rag_node = RAGNode(
57-
input="user_prompt & (parsed_doc | doc)",
58-
output=["relevant_chunks"],
51+
52+
graph_iterator_node = GraphIteratorNode(
53+
input="user_prompt & urls",
54+
output=["results"],
5955
node_config={
60-
"llm_model": llm_model,
61-
"embedder_model": embedder,
56+
"graph_instance": smart_scraper_graph,
6257
"verbose": True,
6358
}
6459
)
65-
generate_answer_node = GenerateAnswerNode(
66-
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
60+
61+
merge_answers_node = MergeAnswersNode(
62+
input="user_prompt & results",
6763
output=["answer"],
6864
node_config={
6965
"llm_model": llm_model,
@@ -78,16 +74,12 @@
7874
graph = BaseGraph(
7975
nodes=[
8076
search_internet_node,
81-
fetch_node,
82-
parse_node,
83-
rag_node,
84-
generate_answer_node,
77+
graph_iterator_node,
78+
merge_answers_node
8579
],
8680
edges=[
87-
(search_internet_node, fetch_node),
88-
(fetch_node, parse_node),
89-
(parse_node, rag_node),
90-
(rag_node, generate_answer_node)
81+
(search_internet_node, graph_iterator_node),
82+
(graph_iterator_node, merge_answers_node)
9183
],
9284
entry_point=search_internet_node
9385
)

scrapegraphai/nodes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,5 @@
1717
from .robots_node import RobotsNode
1818
from .generate_answer_csv_node import GenerateAnswerCSVNode
1919
from .generate_answer_pdf_node import GenerateAnswerPDFNode
20+
from .graph_iterator_node import GraphIteratorNode
21+
from .merge_answers_node import MergeAnswersNode

scrapegraphai/nodes/generate_answer_csv_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Module for generating the answer node
33
"""
44
# Imports from standard library
5-
from typing import List
5+
from typing import List, Optional
66
from tqdm import tqdm
77

88
# Imports from Langchain
@@ -39,7 +39,7 @@ class GenerateAnswerCSVNode(BaseNode):
3939
updating the state with the generated answer under the 'answer' key.
4040
"""
4141

42-
def __init__(self, input: str, output: List[str], node_config: dict,
42+
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None,
4343
node_name: str = "GenerateAnswer"):
4444
"""
4545
Initializes the GenerateAnswerNodeCsv with a language model client and a node name.

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44

55
# Imports from standard library
6-
from typing import List
6+
from typing import List, Optional
77
from tqdm import tqdm
88

99
# Imports from Langchain
@@ -33,7 +33,7 @@ class GenerateAnswerNode(BaseNode):
3333
node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer".
3434
"""
3535

36-
def __init__(self, input: str, output: List[str], node_config: dict,
36+
def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None,
3737
node_name: str = "GenerateAnswer"):
3838
super().__init__(node_name, "node", input, output, 2, node_config)
3939

scrapegraphai/nodes/generate_answer_node_csv.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Module for generating the answer node
33
"""
44
# Imports from standard library
5-
from typing import List
5+
from typing import List, Optional
66
from tqdm import tqdm
77

88
# Imports from Langchain
@@ -39,7 +39,7 @@ class GenerateAnswerCSVNode(BaseNode):
3939
updating the state with the generated answer under the 'answer' key.
4040
"""
4141

42-
def __init__(self, input: str, output: List[str], node_config: dict,
42+
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None,
4343
node_name: str = "GenerateAnswer"):
4444
"""
4545
Initializes the GenerateAnswerNodeCsv with a language model client and a node name.

scrapegraphai/nodes/generate_answer_pdf_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Module for generating the answer node
33
"""
44
# Imports from standard library
5-
from typing import List
5+
from typing import List, Optional
66
from tqdm import tqdm
77

88
# Imports from Langchain
@@ -39,7 +39,7 @@ class GenerateAnswerPDFNode(BaseNode):
3939
updating the state with the generated answer under the 'answer' key.
4040
"""
4141

42-
def __init__(self, input: str, output: List[str], node_config: dict,
42+
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None,
4343
node_name: str = "GenerateAnswer"):
4444
"""
4545
Initializes the GenerateAnswerNodePDF with a language model client and a node name.

scrapegraphai/nodes/generate_scraper_node.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44

55
# Imports from standard library
6-
from typing import List
6+
from typing import List, Optional
77
from tqdm import tqdm
88

99
# Imports from Langchain
@@ -36,8 +36,8 @@ class GenerateScraperNode(BaseNode):
3636
3737
"""
3838

39-
def __init__(self, input: str, output: List[str], node_config: dict,
40-
library: str, website: str, node_name: str = "GenerateAnswer"):
39+
def __init__(self, input: str, output: List[str], library: str, website: str,
40+
node_config: Optional[dict]=None, node_name: str = "GenerateAnswer"):
4141
super().__init__(node_name, "node", input, output, 2, node_config)
4242

4343
self.llm_model = node_config["llm_model"]

scrapegraphai/nodes/get_probable_tags_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
GetProbableTagsNode Module
33
"""
44

5-
from typing import List
5+
from typing import List, Optional
66
from langchain.output_parsers import CommaSeparatedListOutputParser
77
from langchain.prompts import PromptTemplate
88
from .base_node import BaseNode
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""
2+
GraphIterator Module
3+
"""
4+
5+
from typing import List, Optional
6+
import copy
7+
from tqdm import tqdm
8+
from .base_node import BaseNode
9+
10+
11+
class GraphIteratorNode(BaseNode):
    """
    A node that runs a template graph once per URL and collects the results.

    The graph to run is supplied through ``node_config["graph_instance"]``.
    For every URL read from the state, the node takes a shallow copy of that
    graph, sets the copy's ``prompt`` and ``source`` attributes, calls its
    ``run()`` method, and stores the list of returned values in the state
    under the first output key.

    Attributes:
        verbose (bool): A flag indicating whether to show print statements
            and the progress bar during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
            The first resolved key is used as the user prompt, the second as the URLs.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node. Must contain
            ``"graph_instance"`` by the time ``execute`` is called; may contain ``"verbose"``.
        node_name (str): The unique identifier name for the node,
            defaulting to "GraphIterator".
    """

    def __init__(self, input: str, output: List[str],
                 node_config: Optional[dict] = None,
                 node_name: str = "GraphIterator"):
        super().__init__(node_name, "node", input, output, 2, node_config)

        self.verbose = False if node_config is None else node_config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """
        Runs a copy of the configured graph for each URL and stores the answers.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the user prompt and the URLs from the state.

        Returns:
            dict: The same state object, updated so that the first output key maps to
                the list of values returned by each graph run (one per URL, in order).

        Raises:
            KeyError: If the input keys are not found in the state.
            ValueError: If no ``graph_instance`` was provided in the node configuration.
        """

        if self.verbose:
            print(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)

        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]

        user_prompt = input_data[0]
        urls = input_data[1]

        # node_config is allowed to be None by the constructor; treat that the
        # same as a config without a graph so the caller gets the ValueError
        # below rather than an AttributeError on None.
        node_config = self.node_config or {}
        graph_instance = node_config.get("graph_instance")
        if graph_instance is None:
            raise ValueError("Graph instance is required for graph iteration.")

        # A bare string would otherwise be iterated character by character.
        if isinstance(urls, str):
            urls = [urls]

        graphs_answers = []
        for url in tqdm(urls, desc="Processing Graph Instances", disable=not self.verbose):
            # Work on a copy so the caller's template graph is left untouched.
            # NOTE(review): this is a shallow copy, so any mutable objects held
            # by the graph are shared between copies - confirm that the graph's
            # run() does not depend on per-instance mutable state.
            graph_copy = copy.copy(graph_instance)
            graph_copy.prompt = user_prompt
            graph_copy.source = url
            graphs_answers.append(graph_copy.run())

        state.update({self.output[0]: graphs_answers})

        return state

scrapegraphai/nodes/image_to_text_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
ImageToTextNode Module
33
"""
44

5-
from typing import List
5+
from typing import List, Optional
66
from .base_node import BaseNode
77

88

@@ -21,7 +21,7 @@ class ImageToTextNode(BaseNode):
2121
node_name (str): The unique identifier name for the node, defaulting to "ImageToText".
2222
"""
2323

24-
def __init__(self, input: str, output: List[str], node_config: dict,
24+
def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None,
2525
node_name: str = "ImageToText"):
2626
super().__init__(node_name, "node", input, output, 1, node_config)
2727

0 commit comments

Comments
 (0)