Skip to content

Commit 4b371f4

Browse files
feat: add deep scraper implementation
Co-Authored-By: Matteo Vedovati <[email protected]>
1 parent 17c5145 commit 4b371f4

File tree

3 files changed

+57
-14
lines changed

3 files changed

+57
-14
lines changed

examples/openai/fetch_multiple_links.py renamed to examples/openai/depth_search_graph_openai.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
1+
"""
2+
depth_search_graph_openai example
3+
"""
24
from scrapegraphai.graphs import DepthSearchGraph
35

46
graph_config = {
@@ -19,4 +21,4 @@
1921
)
2022

2123
result = search_graph.run()
22-
print(result)
24+
print(result)

scrapegraphai/graphs/depth_search_graph.py

Lines changed: 52 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,18 @@
99
from ..utils.save_code_to_file import save_code_to_file
1010
from ..nodes import (
1111
FetchNodeLevelK,
12-
ParseNodeDepthK
12+
ParseNodeDepthK,
13+
DescriptionNode,
14+
RAGNode,
15+
GenerateAnswerNodeKLevel
1316
)
1417

1518
class DepthSearchGraph(AbstractGraph):
1619
"""
17-
CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for
18-
extracting the wanted information from a HTML page. The code generated is in Python and uses the library BeautifulSoup.
20+
DepthSearchGraph is a scraping pipeline that chains five nodes:
21+
it fetches the source (FetchNodeLevelK), parses the fetched
22+
documents (ParseNodeDepthK), describes them (DescriptionNode), builds a
23+
vector store from them (RAGNode), and generates the answer (GenerateAnswerNodeKLevel).
1924
It requires a user prompt, a source URL, and an output schema.
2025
2126
Attributes:
@@ -60,7 +65,7 @@ def _create_graph(self) -> BaseGraph:
6065
BaseGraph: A graph instance representing the web scraping workflow.
6166
"""
6267

63-
fetch_node = FetchNodeLevelK(
68+
fetch_node_k = FetchNodeLevelK(
6469
input="url| local_dir",
6570
output=["docs"],
6671
node_config={
@@ -72,24 +77,61 @@ def _create_graph(self) -> BaseGraph:
7277
"only_inside_links": self.config.get("only_inside_links", False)
7378
}
7479
)
75-
76-
parse_node = ParseNodeDepthK(
80+
81+
parse_node_k = ParseNodeDepthK(
7782
input="docs",
7883
output=["docs"],
7984
node_config={
8085
"verbose": self.config.get("verbose", False)
8186
}
8287
)
8388

89+
description_node = DescriptionNode(
90+
input="docs",
91+
output=["docs"],
92+
node_config={
93+
"llm_model": self.llm_model,
94+
"verbose": self.config.get("verbose", False),
95+
"cache_path": self.config.get("cache_path", False)
96+
}
97+
)
98+
99+
rag_node = RAGNode (
100+
input="docs",
101+
output=["vectorial_db"],
102+
node_config={
103+
"llm_model": self.llm_model,
104+
"embedder_model": self.config.get("embedder_model", False),
105+
"verbose": self.config.get("verbose", False),
106+
}
107+
)
108+
109+
generate_answer_k = GenerateAnswerNodeKLevel(
110+
input="vectorial_db",
111+
output=["answer"],
112+
node_config={
113+
"llm_model": self.llm_model,
114+
"embedder_model": self.config.get("embedder_model", False),
115+
"verbose": self.config.get("verbose", False),
116+
}
117+
118+
)
119+
84120
return BaseGraph(
85121
nodes=[
86-
fetch_node,
87-
parse_node
122+
fetch_node_k,
123+
parse_node_k,
124+
description_node,
125+
rag_node,
126+
generate_answer_k
88127
],
89128
edges=[
90-
(fetch_node, parse_node),
129+
(fetch_node_k, parse_node_k),
130+
(parse_node_k, description_node),
131+
(description_node, rag_node),
132+
(rag_node, generate_answer_k)
91133
],
92-
entry_point=fetch_node,
134+
entry_point=fetch_node_k,
93135
graph_name=self.__class__.__name__
94136
)
95137

scrapegraphai/nodes/description_node.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,11 @@ def __init__(
3131
input: str,
3232
output: List[str],
3333
node_config: Optional[dict] = None,
34-
node_name: str = "RAG",
34+
node_name: str = "DESCRIPTION",
3535
):
3636
super().__init__(node_name, "node", input, output, 2, node_config)
3737

3838
self.llm_model = node_config["llm_model"]
39-
self.embedder_model = node_config.get("embedder_model", None)
4039
self.verbose = (
4140
False if node_config is None else node_config.get("verbose", False)
4241
)

0 commit comments

Comments
 (0)