Skip to content

Commit ec8cbca

Browse files
authored
Merge branch 'pre/beta' into try
2 parents 7fd06a6 + 191db0b commit ec8cbca

File tree

6 files changed

+78
-35
lines changed

6 files changed

+78
-35
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
## [1.3.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.2.4...v1.3.0) (2024-05-19)
22

33

4+
45
### Features
56

67
* add new model ([8c7afa7](https://github.com/VinciGit00/Scrapegraph-ai/commit/8c7afa7570f0a104578deb35658168435cfe5ae1))
78

9+
810
## [1.2.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.2.3...v1.2.4) (2024-05-17)
911

1012

examples/openai/deep_scraper_openai.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"model": "gpt-4",
2323
},
2424
"verbose": True,
25+
"max_depth": 1
2526
}
2627

2728
# ************************************************

scrapegraphai/graphs/base_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,4 +145,4 @@ def execute(self, initial_state: dict) -> Tuple[dict, list]:
145145
"exec_time": total_exec_time,
146146
})
147147

148-
return state, exec_info
148+
return state, exec_info

scrapegraphai/graphs/deep_scraper_graph.py

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
SearchLinkNode,
99
ParseNode,
1010
RAGNode,
11-
GenerateAnswerNode
11+
GenerateAnswerNode,
12+
GraphIteratorNode,
13+
MergeAnswersNode
1214
)
1315
from .abstract_graph import AbstractGraph
1416

@@ -18,12 +20,11 @@ class DeepScraperGraph(AbstractGraph):
1820
[WIP]
1921
2022
DeepScraper is a scraping pipeline that automates the process of
21-
extracting information from web pages
22-
using a natural language model to interpret and answer prompts.
23-
24-
Unlike SmartScraper, DeepScraper can navigate to the links within the input webpage,
25-
to fuflfil the task within the prompt.
23+
extracting information from web pages using a natural language model
24+
to interpret and answer prompts.
2625
26+
Unlike SmartScraper, DeepScraper can navigate to the links within
27+
the input webpage to fulfil the task within the prompt.
2728
2829
Attributes:
2930
prompt (str): The prompt for the graph.
@@ -50,12 +51,13 @@ class DeepScraperGraph(AbstractGraph):
5051

5152
def __init__(self, prompt: str, source: str, config: dict):
5253
super().__init__(prompt, config, source)
53-
5454
self.input_key = "url" if source.startswith("http") else "local_dir"
5555

56-
def _create_graph(self) -> BaseGraph:
56+
def _create_repeated_graph(self) -> BaseGraph:
5757
"""
58-
Creates the graph of nodes representing the workflow for web scraping.
58+
Creates the graph that can be repeatedly executed to conduct search on
59+
hyperlinks within the webpage.
60+
5961
Returns:
6062
BaseGraph: A graph instance representing the web scraping workflow.
6163
"""
@@ -78,6 +80,13 @@ def _create_graph(self) -> BaseGraph:
7880
"embedder_model": self.embedder_model
7981
}
8082
)
83+
generate_answer_node = GenerateAnswerNode(
84+
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
85+
output=["answer"],
86+
node_config={
87+
"llm_model": self.llm_model
88+
}
89+
)
8190
search_node = SearchLinkNode(
8291
input="user_prompt & relevant_chunks",
8392
output=["relevant_links"],
@@ -86,23 +95,60 @@ def _create_graph(self) -> BaseGraph:
8695
"embedder_model": self.embedder_model
8796
}
8897
)
98+
graph_iterator_node = GraphIteratorNode(
99+
input="user_prompt & relevant_links",
100+
output=["results"],
101+
node_config={
102+
"graph_instance": None,
103+
"batchsize": 1
104+
}
105+
)
106+
merge_answers_node = MergeAnswersNode(
107+
input="user_prompt & results",
108+
output=["answer"],
109+
node_config={
110+
"llm_model": self.llm_model,
111+
}
112+
)
89113

90114
return BaseGraph(
91115
nodes=[
92116
fetch_node,
93117
parse_node,
94118
rag_node,
95-
search_node
119+
generate_answer_node,
120+
search_node,
121+
graph_iterator_node,
122+
merge_answers_node
96123
],
97124
edges=[
98125
(fetch_node, parse_node),
99126
(parse_node, rag_node),
100-
(rag_node, search_node)
101-
127+
(rag_node, generate_answer_node),
128+
(rag_node, search_node),
129+
(search_node, graph_iterator_node),
130+
(graph_iterator_node, merge_answers_node)
102131
],
103132
entry_point=fetch_node
104133
)
105134

135+
136+
137+
def _create_graph(self) -> BaseGraph:
138+
"""
139+
Creates the graph of nodes representing the workflow for web scraping
140+
n-levels deep.
141+
142+
Returns:
143+
BaseGraph: A graph instance representing the web scraping workflow.
144+
"""
145+
146+
base_graph = self._create_repeated_graph()
147+
graph_iterator_node = list(filter(lambda x: x.node_name == "GraphIterator", base_graph.nodes))[0]
148+
# Graph iterator will repeat the same graph for multiple hyperlinks found within input webpage
149+
graph_iterator_node.node_config["graph_instance"] = self
150+
return base_graph
151+
106152
def run(self) -> str:
107153
"""
108154
Executes the scraping process and returns the answer to the prompt.

scrapegraphai/nodes/conditional_node.py

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,46 +13,33 @@ class ConditionalNode(BaseNode):
1313
This node type is used to implement branching logic within the graph, allowing
1414
for dynamic paths based on the data available in the current state.
1515
16+
It is expected that exactly two edges are created out of this node.
17+
The first node is chosen for execution if the key exists and has a non-empty value,
18+
and the second node is chosen if the key does not exist or is empty.
19+
1620
Attributes:
1721
key_name (str): The name of the key in the state to check for its presence.
18-
next_nodes (list): A list of two node instances. The first node is chosen
19-
for execution if the key exists and has a non-empty value,
20-
and the second node is chosen if the key does not exist or
21-
is empty.
2222
2323
Args:
2424
key_name (str): The name of the key to check in the graph's state. This is
2525
used to determine the path the graph's execution should take.
26-
next_nodes (list): A list containing exactly two node instances, specifying
27-
the next nodes to execute based on the condition's outcome.
2826
node_name (str, optional): The unique identifier name for the node. Defaults
2927
to "ConditionalNode".
3028
31-
Raises:
32-
ValueError: If next_nodes does not contain exactly two elements, indicating
33-
a misconfiguration in specifying the conditional paths.
3429
"""
3530

36-
def __init__(self, key_name: str, next_nodes: list, node_name="ConditionalNode"):
31+
def __init__(self, key_name: str, node_name="ConditionalNode"):
3732
"""
3833
Initializes the node with the key to check and the next node names based on the condition.
3934
4035
Args:
4136
key_name (str): The name of the key to check in the state.
42-
next_nodes (list): A list containing exactly two names of the next nodes.
43-
The first is used if the key exists, the second if it does not.
44-
45-
Raises:
46-
ValueError: If next_nodes does not contain exactly two elements.
4737
"""
4838

4939
super().__init__(node_name, "conditional_node")
5040
self.key_name = key_name
51-
if len(next_nodes) != 2:
52-
raise ValueError("next_nodes must contain exactly two elements.")
53-
self.next_nodes = next_nodes
5441

55-
def execute(self, state: dict) -> str:
42+
def execute(self, state: dict) -> dict:
5643
"""
5744
Checks if the specified key is present in the state and decides the next node accordingly.
5845
@@ -64,5 +51,7 @@ def execute(self, state: dict) -> str:
6451
"""
6552

6653
if self.key_name in state and len(state[self.key_name]) > 0:
67-
return self.next_nodes[0].node_name
68-
return self.next_nodes[1].node_name
54+
state["next_node"] = 0
55+
else:
56+
state["next_node"] = 1
57+
return state

scrapegraphai/nodes/graph_iterator_node.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,12 @@ async def _async_execute(self, state: dict, batchsize: int) -> dict:
104104
if graph_instance is None:
105105
raise ValueError("graph instance is required for concurrent execution")
106106

107-
# sets the prompt for the graph instance
107+
# Assign depth level to the graph
108+
if "graph_depth" in graph_instance.config:
109+
graph_instance.config["graph_depth"] += 1
110+
else:
111+
graph_instance.config["graph_depth"] = 1
112+
108113
graph_instance.prompt = user_prompt
109114

110115
participants = []

0 commit comments

Comments
 (0)