Commit f36b3e3

Merge pull request #254 from mayurdb/nDeep
feat: n-level deep search
2 parents ba8a4f7 + 1e0b2f7 commit f36b3e3

File tree

4 files changed: +78 -14 lines changed

examples/openai/deep_scraper_openai.py

Lines changed: 2 additions & 1 deletion
@@ -22,6 +22,7 @@
         "model": "gpt-3.5-turbo",
     },
     "verbose": True,
+    "max_depth": 1
 }

@@ -31,7 +32,7 @@
 deep_scraper_graph = DeepScraperGraph(
     prompt="List me all the contacts",
     # also accepts a string with the already downloaded HTML code
-    source="https://davittoriogift.com/en/home/a",
+    source="https://www.uber.com/us/en/careers/list/?query=",
     config=graph_config
 )
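Taken together, the updated example caps link-following at one level. A minimal sketch of the full script, assuming the usual OPENAI_APIKEY environment variable and that run() returns the merged answer (neither is shown in this diff):

import os
from scrapegraphai.graphs import DeepScraperGraph

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_APIKEY"),  # assumed env var name
        "model": "gpt-3.5-turbo",
    },
    "verbose": True,
    "max_depth": 1,  # follow hyperlinks at most one level deep
}

deep_scraper_graph = DeepScraperGraph(
    prompt="List me all the contacts",
    source="https://www.uber.com/us/en/careers/list/?query=",
    config=graph_config,
)

print(deep_scraper_graph.run())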

scrapegraphai/graphs/base_graph.py

Lines changed: 5 additions & 1 deletion
@@ -132,7 +132,11 @@ def execute(self, initial_state: dict) -> Tuple[dict, list]:
                 cb_total["successful_requests"] += cb["successful_requests"]
                 cb_total["total_cost_USD"] += cb["total_cost_USD"]

-            if current_node in self.edges:
+            # Do not execute the graph from this point on if the previous node raised a stop signal
+            if 'skip_branch' in result:
+                print(f"---- Not executing sub-graph since {current_node.node_name} "
+                      "raised a stop signal ----")
+            elif current_node in self.edges:
                 current_node_connections = self.edges[current_node]
                 if current_node.node_type == 'conditional_node':
                     # Assert that there are exactly two out edges from the conditional node
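The stop signal is nothing more than a key in the result state: any node can set skip_branch, and the traversal above then refuses to follow the current node's outgoing edges. A minimal sketch of that contract, with an illustrative node function that is not part of the library:

# depth_guard_node is illustrative only; in this PR the signal is set by GraphIteratorNode.
def depth_guard_node(state: dict) -> dict:
    # Halt this branch once the crawl has gone as deep as allowed.
    if state.get("depth", 0) >= state.get("max_depth", 1):
        state["skip_branch"] = "True"  # BaseGraph.execute checks the result for this key
    return state

state = {"depth": 1, "max_depth": 1}
result = depth_guard_node(state)
assert "skip_branch" in result  # traversal stops here: outgoing edges are not followed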

scrapegraphai/graphs/deep_scraper_graph.py

Lines changed: 58 additions & 12 deletions
@@ -8,7 +8,9 @@
     SearchLinkNode,
     ParseNode,
     RAGNode,
-    GenerateAnswerNode
+    GenerateAnswerNode,
+    GraphIteratorNode,
+    MergeAnswersNode
 )
 from .abstract_graph import AbstractGraph

@@ -18,12 +20,11 @@ class DeepScraperGraph(AbstractGraph):
     [WIP]

     DeepScraper is a scraping pipeline that automates the process of
-    extracting information from web pages
-    using a natural language model to interpret and answer prompts.
-
-    Unlike SmartScraper, DeepScraper can navigate to the links within the input webpage,
-    to fulfil the task within the prompt.
+    extracting information from web pages using a natural language model
+    to interpret and answer prompts.

+    Unlike SmartScraper, DeepScraper can navigate to the links within
+    the input webpage to fulfil the task within the prompt.

     Attributes:
         prompt (str): The prompt for the graph.

@@ -50,12 +51,13 @@ class DeepScraperGraph(AbstractGraph):

     def __init__(self, prompt: str, source: str, config: dict):
         super().__init__(prompt, config, source)
-
         self.input_key = "url" if source.startswith("http") else "local_dir"

-    def _create_graph(self) -> BaseGraph:
+    def _create_repeated_graph(self) -> BaseGraph:
         """
-        Creates the graph of nodes representing the workflow for web scraping.
+        Creates the graph that can be repeatedly executed to conduct search on
+        hyperlinks within the webpage.
+
         Returns:
             BaseGraph: A graph instance representing the web scraping workflow.
         """

@@ -78,6 +80,13 @@ def _create_graph(self) -> BaseGraph:
                 "embedder_model": self.embedder_model
             }
         )
+        generate_answer_node = GenerateAnswerNode(
+            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model
+            }
+        )
         search_node = SearchLinkNode(
             input="user_prompt & relevant_chunks",
             output=["relevant_links"],

@@ -86,23 +95,60 @@ def _create_graph(self) -> BaseGraph:
                 "embedder_model": self.embedder_model
             }
         )
+        graph_iterator_node = GraphIteratorNode(
+            input="user_prompt & relevant_links",
+            output=["results"],
+            node_config={
+                "graph_instance": None,
+                "batchsize": 1
+            }
+        )
+        merge_answers_node = MergeAnswersNode(
+            input="user_prompt & results",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model,
+            }
+        )

         return BaseGraph(
             nodes=[
                 fetch_node,
                 parse_node,
                 rag_node,
-                search_node
+                generate_answer_node,
+                search_node,
+                graph_iterator_node,
+                merge_answers_node
             ],
             edges=[
                 (fetch_node, parse_node),
                 (parse_node, rag_node),
-                (rag_node, search_node)
-
+                (rag_node, generate_answer_node),
+                (rag_node, search_node),
+                (search_node, graph_iterator_node),
+                (graph_iterator_node, merge_answers_node)
             ],
             entry_point=fetch_node
         )

+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping
+        n-levels deep.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
+        """
+        base_graph = self._create_repeated_graph()
+        graph_iterator_node = list(filter(lambda x: x.node_name == "GraphIterator", base_graph.nodes))[0]
+        # The graph iterator repeats the same graph for multiple hyperlinks found within the input webpage
+        graph_iterator_node.node_config["graph_instance"] = self
+        return base_graph
+
     def run(self) -> str:
         """
         Executes the scraping process and returns the answer to the prompt.
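The recursion is worth spelling out: _create_graph wires the DeepScraperGraph instance itself into its own GraphIteratorNode, so every relevant link found by search_node triggers a fresh run of the same pipeline, one level deeper. A toy model of that self-referential pattern (illustrative names, not the library API):

# Toy model of the self-referential wiring set up in _create_graph above.
class ToyGraph:
    def __init__(self, max_depth: int):
        self.max_depth = max_depth
        self.child = self  # like node_config["graph_instance"] = self

    def run(self, links: list, depth: int = 0) -> list:
        if depth >= self.max_depth:
            return []  # the skip_branch signal plays this role in the real pipeline
        answers = [f"answer for {url}" for url in links]
        for url in links:
            # each relevant link spawns another run of the same graph, one level deeper
            answers += self.child.run([url + "/sub"], depth + 1)
        return answers

print(ToyGraph(max_depth=2).run(["https://example.com"]))

Without a depth cap, the self-reference would recurse indefinitely; the guard added in graph_iterator_node.py below is what terminates it.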

scrapegraphai/nodes/graph_iterator_node.py

Lines changed: 13 additions & 0 deletions
@@ -104,6 +104,19 @@ async def _async_execute(self, state: dict, batchsize: int) -> dict:
         if graph_instance is None:
             raise ValueError("graph instance is required for concurrent execution")

+        # Assign the depth level to the graph
+        if "graph_depth" in graph_instance.config:
+            graph_instance.config["graph_depth"] += 1
+        else:
+            graph_instance.config["graph_depth"] = 1
+
+        # Check whether the max depth has been reached
+        if "max_depth" in graph_instance.config and \
+           graph_instance.config["graph_depth"] > graph_instance.config["max_depth"]:
+            print("Max search depth reached. Terminating search")
+            state.update({"skip_branch": "True"})
+            return state
+
         # sets the prompt for the graph instance
         graph_instance.prompt = user_prompt
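Because the counter lives on the config of the single shared graph_instance, it is incremented once per iterator invocation rather than per branch. A standalone sketch of the guard, assuming config and state are plain dicts:

# Standalone sketch of the depth guard added above.
def check_depth(config: dict, state: dict) -> bool:
    # Increment the crawl depth; report whether this branch should be skipped.
    config["graph_depth"] = config.get("graph_depth", 0) + 1
    if "max_depth" in config and config["graph_depth"] > config["max_depth"]:
        state["skip_branch"] = "True"  # picked up by BaseGraph.execute
        return True
    return False

config, state = {"max_depth": 1}, {}
assert check_depth(config, state) is False  # depth 1: within the limit
assert check_depth(config, state) is True   # depth 2: exceeds max_depth, skip branch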
