     SearchLinkNode,
     ParseNode,
     RAGNode,
-    GenerateAnswerNode
+    GenerateAnswerNode,
+    GraphIteratorNode,
+    MergeAnswersNode
 )
 from .abstract_graph import AbstractGraph
@@ -18,12 +20,11 @@ class DeepScraperGraph(AbstractGraph):
     [WIP]

     DeepScraper is a scraping pipeline that automates the process of
-    extracting information from web pages
-    using a natural language model to interpret and answer prompts.
-
-    Unlike SmartScraper, DeepScraper can navigate to the links within the input webpage,
-    to fuflfil the task within the prompt.
+    extracting information from web pages using a natural language model
+    to interpret and answer prompts.

+    Unlike SmartScraper, DeepScraper can navigate to the links within
+    the input webpage to fulfil the task within the prompt.

     Attributes:
         prompt (str): The prompt for the graph.
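
For orientation, the class added here would be driven like the other graphs in the package. A minimal usage sketch follows; the `graph_config` keys mirror the existing SmartScraperGraph examples and are an assumption, not part of this diff:

```python
# Hypothetical usage sketch -- the config keys follow the existing
# SmartScraperGraph examples and are assumptions, not part of this diff.
from scrapegraphai.graphs import DeepScraperGraph

graph_config = {
    "llm": {
        "model": "gpt-3.5-turbo",        # assumed model key
        "api_key": "YOUR_OPENAI_API_KEY",
    },
}

deep_scraper = DeepScraperGraph(
    prompt="List the authors of the articles linked from this page",
    source="https://example.com/blog",   # startswith("http"), so input_key is "url"
    config=graph_config,
)
print(deep_scraper.run())
```

Passing a filesystem path instead of an `http(s)` URL would set `input_key` to `"local_dir"`, per the branch in `__init__` below.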
@@ -50,12 +51,13 @@ class DeepScraperGraph(AbstractGraph):

     def __init__(self, prompt: str, source: str, config: dict):
         super().__init__(prompt, config, source)
-
         self.input_key = "url" if source.startswith("http") else "local_dir"

-    def _create_graph(self) -> BaseGraph:
+    def _create_repeated_graph(self) -> BaseGraph:
         """
-        Creates the graph of nodes representing the workflow for web scraping.
+        Creates the graph that can be repeatedly executed to conduct search on
+        hyperlinks within the webpage.
+
         Returns:
             BaseGraph: A graph instance representing the web scraping workflow.
         """
@@ -78,6 +80,13 @@ def _create_graph(self) -> BaseGraph:
                 "embedder_model": self.embedder_model
             }
         )
+        generate_answer_node = GenerateAnswerNode(
+            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model
+            }
+        )
         search_node = SearchLinkNode(
             input="user_prompt & relevant_chunks",
             output=["relevant_links"],
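
The `input` strings such as `"user_prompt & (relevant_chunks | parsed_doc | doc)"` form a small expression language over the graph's state keys: `&` appears to require every operand, while `|` falls back to the first key already present in the state (here, `relevant_chunks`, then `parsed_doc`, then `doc`). A simplified sketch of that resolution, assuming only the flat `A & (B | C)` shapes used in this file (the real parser lives in the shared node base class, not in this diff):

```python
# Illustrative sketch of resolving a node input expression against state.
# Only handles the flat "A & (B | C)" shape used in this file.

def resolve_inputs(expression: str, state: dict) -> list:
    selected = []
    for operand in expression.split("&"):           # '&': every operand required
        alternatives = operand.strip().strip("()").split("|")
        for key in alternatives:                    # '|': first available key wins
            key = key.strip()
            if key in state:
                selected.append(key)
                break
        else:
            raise KeyError(f"no key among {alternatives} found in state")
    return selected

state = {"user_prompt": "...", "parsed_doc": "..."}
print(resolve_inputs("user_prompt & (relevant_chunks | parsed_doc | doc)", state))
# -> ['user_prompt', 'parsed_doc']
```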
@@ -86,23 +95,60 @@
                 "embedder_model": self.embedder_model
             }
         )
+        graph_iterator_node = GraphIteratorNode(
+            input="user_prompt & relevant_links",
+            output=["results"],
+            node_config={
+                "graph_instance": None,
+                "batchsize": 1
+            }
+        )
+        merge_answers_node = MergeAnswersNode(
+            input="user_prompt & results",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model,
+            }
+        )

         return BaseGraph(
             nodes=[
                 fetch_node,
                 parse_node,
                 rag_node,
-                search_node
+                generate_answer_node,
+                search_node,
+                graph_iterator_node,
+                merge_answers_node
             ],
             edges=[
                 (fetch_node, parse_node),
                 (parse_node, rag_node),
-                (rag_node, search_node)
-
+                (rag_node, generate_answer_node),
+                (rag_node, search_node),
+                (search_node, graph_iterator_node),
+                (graph_iterator_node, merge_answers_node)
             ],
             entry_point=fetch_node
         )

+
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping
+        n-levels deep.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
+        """
+
+        base_graph = self._create_repeated_graph()
+        graph_iterator_node = list(filter(lambda x: x.node_name == "GraphIterator", base_graph.nodes))[0]
+        # Graph iterator will repeat the same graph for multiple hyperlinks found within the input webpage
+        graph_iterator_node.node_config["graph_instance"] = self
+        return base_graph
+
     def run(self) -> str:
         """
         Executes the scraping process and returns the answer to the prompt.
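
The key move in this diff is `_create_graph` injecting the graph into its own iterator node (`graph_instance = self`): each link surfaced by `SearchLinkNode` is scraped by a fresh copy of the same pipeline, and `MergeAnswersNode` folds the per-link answers into one. A stripped-down sketch of that iterate-and-merge loop (all names below are illustrative, not the real node API):

```python
# Hypothetical, stripped-down version of what the iterator/merge nodes do.
# Assumes the graph keeps its config on self.config, as AbstractGraph
# subclasses appear to; names and signatures here are illustrative only.

def iterate_subgraphs(graph, user_prompt: str, relevant_links: list,
                      batchsize: int = 1) -> list:
    results = []
    for i in range(0, len(relevant_links), batchsize):
        for link in relevant_links[i:i + batchsize]:
            # Each link is scraped by a fresh copy of the same pipeline,
            # which is what makes the traversal n-levels deep.
            sub_graph = type(graph)(user_prompt, link, graph.config)
            results.append(sub_graph.run())
    return results

def merge_answers(user_prompt: str, results: list) -> str:
    # In the real graph this is an LLM call (MergeAnswersNode);
    # plain concatenation here just shows the data flow.
    return "\n".join(results)
```

Note that the sketch, like the diff itself, has no depth limit or visited-link set to terminate the recursion, which is consistent with the class's [WIP] marker.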