Skip to content

Commit 80053a2

Browse files
committed
Merge branch 'pre/beta' of https://github.com/VinciGit00/Scrapegraph-ai into pre/beta
2 parents 389b52a + 532adb6 commit 80053a2

25 files changed

+401
-80
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## [0.9.0-beta.5](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.9.0-beta.4...v0.9.0-beta.5) (2024-05-06)
2+
3+
4+
### Features
5+
6+
* fixed custom_graphs example and robots_node ([84fcb44](https://github.com/VinciGit00/Scrapegraph-ai/commit/84fcb44aaa36e84f775884138d04f4a60bb389be))
7+
* multiple graph instances ([dbb614a](https://github.com/VinciGit00/Scrapegraph-ai/commit/dbb614a8dd88d7667fe3daaf0263f5d6e9be1683))
8+
* **node:** multiple url search in SearchGraph + fixes ([930adb3](https://github.com/VinciGit00/Scrapegraph-ai/commit/930adb38f2154ba225342466bfd1846c47df72a0))
9+
110
## [0.9.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.9.0-beta.3...v0.9.0-beta.4) (2024-05-05)
211

312

examples/openai/custom_graph_openai.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
import os
66
from dotenv import load_dotenv
7+
8+
from langchain_openai import OpenAIEmbeddings
79
from scrapegraphai.models import OpenAI
810
from scrapegraphai.graphs import BaseGraph
911
from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode
@@ -20,7 +22,7 @@
2022
"api_key": openai_key,
2123
"model": "gpt-3.5-turbo",
2224
"temperature": 0,
23-
"streaming": True
25+
"streaming": False
2426
},
2527
}
2628

@@ -29,33 +31,50 @@
2931
# ************************************************
3032

3133
llm_model = OpenAI(graph_config["llm"])
34+
embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key)
3235

3336
# define the nodes for the graph
3437
robot_node = RobotsNode(
3538
input="url",
3639
output=["is_scrapable"],
37-
node_config={"llm_model": llm_model}
40+
node_config={
41+
"llm_model": llm_model,
42+
"verbose": True,
43+
}
3844
)
3945

4046
fetch_node = FetchNode(
4147
input="url | local_dir",
4248
output=["doc"],
43-
node_config={"headless": True, "verbose": True}
49+
node_config={
50+
"verbose": True,
51+
"headless": True,
52+
}
4453
)
4554
parse_node = ParseNode(
4655
input="doc",
4756
output=["parsed_doc"],
48-
node_config={"chunk_size": 4096}
57+
node_config={
58+
"chunk_size": 4096,
59+
"verbose": True,
60+
}
4961
)
5062
rag_node = RAGNode(
5163
input="user_prompt & (parsed_doc | doc)",
5264
output=["relevant_chunks"],
53-
node_config={"llm_model": llm_model},
65+
node_config={
66+
"llm_model": llm_model,
67+
"embedder_model": embedder,
68+
"verbose": True,
69+
}
5470
)
5571
generate_answer_node = GenerateAnswerNode(
5672
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
5773
output=["answer"],
58-
node_config={"llm_model": llm_model},
74+
node_config={
75+
"llm_model": llm_model,
76+
"verbose": True,
77+
}
5978
)
6079

6180
# ************************************************

examples/openai/search_graph_multi.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""
2+
Example of custom graph using existing nodes
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from langchain_openai import OpenAIEmbeddings
8+
from scrapegraphai.models import OpenAI
9+
from scrapegraphai.graphs import BaseGraph, SmartScraperGraph
10+
from scrapegraphai.nodes import SearchInternetNode, GraphIteratorNode, MergeAnswersNode
11+
load_dotenv()
12+
13+
# ************************************************
14+
# Define the configuration for the graph
15+
# ************************************************
16+
17+
openai_key = os.getenv("OPENAI_APIKEY")
18+
19+
graph_config = {
20+
"llm": {
21+
"api_key": openai_key,
22+
"model": "gpt-3.5-turbo",
23+
},
24+
}
25+
26+
# ************************************************
27+
# Create a SmartScraperGraph instance
28+
# ************************************************
29+
30+
smart_scraper_graph = SmartScraperGraph(
31+
prompt="",
32+
source="",
33+
config=graph_config
34+
)
35+
36+
# ************************************************
37+
# Define the graph nodes
38+
# ************************************************
39+
40+
llm_model = OpenAI(graph_config["llm"])
41+
embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key)
42+
43+
search_internet_node = SearchInternetNode(
44+
input="user_prompt",
45+
output=["urls"],
46+
node_config={
47+
"llm_model": llm_model,
48+
"max_results": 5, # num of search results to fetch
49+
"verbose": True,
50+
}
51+
)
52+
53+
graph_iterator_node = GraphIteratorNode(
54+
input="user_prompt & urls",
55+
output=["results"],
56+
node_config={
57+
"graph_instance": smart_scraper_graph,
58+
"verbose": True,
59+
}
60+
)
61+
62+
merge_answers_node = MergeAnswersNode(
63+
input="user_prompt & results",
64+
output=["answer"],
65+
node_config={
66+
"llm_model": llm_model,
67+
"verbose": True,
68+
}
69+
)
70+
71+
# ************************************************
72+
# Create the graph by defining the connections
73+
# ************************************************
74+
75+
graph = BaseGraph(
76+
nodes=[
77+
search_internet_node,
78+
graph_iterator_node,
79+
merge_answers_node
80+
],
81+
edges=[
82+
(search_internet_node, graph_iterator_node),
83+
(graph_iterator_node, merge_answers_node)
84+
],
85+
entry_point=search_internet_node
86+
)
87+
88+
# ************************************************
89+
# Execute the graph
90+
# ************************************************
91+
92+
result, execution_info = graph.execute({
93+
"user_prompt": "List me all the typical Chioggia dishes."
94+
})
95+
96+
# get the answer from the result
97+
result = result.get("answer", "No answer found.")
98+
print(result)

examples/openai/search_graph_openai.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,16 @@
1919
"api_key": openai_key,
2020
"model": "gpt-3.5-turbo",
2121
},
22+
"max_results": 5,
23+
"verbose": True,
2224
}
2325

2426
# ************************************************
2527
# Create the SearchGraph instance and run it
2628
# ************************************************
2729

2830
search_graph = SearchGraph(
29-
prompt="List me top 5 eyeliner products for a gift.",
31+
prompt="List me the best excursions near Trento",
3032
config=graph_config
3133
)
3234

examples/openai/smart_scraper_openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"api_key": openai_key,
2222
"model": "gpt-3.5-turbo",
2323
},
24-
"verbose": True,
24+
"verbose": False,
2525
}
2626

2727
# ************************************************

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[tool.poetry]
22
name = "scrapegraphai"
33

4-
version = "0.9.0b4"
4+
version = "0.9.0b5"
55

66
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
77
authors = [

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
__init__.py file for graphs folder
33
"""
44

5+
from .abstract_graph import AbstractGraph
56
from .base_graph import BaseGraph
67
from .smart_scraper_graph import SmartScraperGraph
78
from .speech_graph import SpeechGraph

scrapegraphai/graphs/abstract_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
5656
self.execution_info = None
5757

5858
# Set common configuration parameters
59-
self.verbose = True if config is None else config.get("verbose", False)
59+
self.verbose = False if config is None else config.get("verbose", False)
6060
self.headless = True if config is None else config.get(
6161
"headless", True)
6262
common_params = {"headless": self.headless,

scrapegraphai/graphs/search_graph.py

Lines changed: 37 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@
55
from .base_graph import BaseGraph
66
from ..nodes import (
77
SearchInternetNode,
8-
FetchNode,
9-
ParseNode,
10-
RAGNode,
11-
GenerateAnswerNode
8+
GraphIteratorNode,
9+
MergeAnswersNode
1210
)
1311
from .abstract_graph import AbstractGraph
12+
from .smart_scraper_graph import SmartScraperGraph
1413

1514

1615
class SearchGraph(AbstractGraph):
@@ -38,6 +37,11 @@ class SearchGraph(AbstractGraph):
3837
>>> result = search_graph.run()
3938
"""
4039

40+
def __init__(self, prompt: str, config: dict):
41+
42+
self.max_results = config.get("max_results", 3)
43+
super().__init__(prompt, config)
44+
4145
def _create_graph(self) -> BaseGraph:
4246
"""
4347
Creates the graph of nodes representing the workflow for web scraping and searching.
@@ -46,53 +50,53 @@ def _create_graph(self) -> BaseGraph:
4650
BaseGraph: A graph instance representing the web scraping and searching workflow.
4751
"""
4852

53+
# ************************************************
54+
# Create a SmartScraperGraph instance
55+
# ************************************************
56+
57+
smart_scraper_instance = SmartScraperGraph(
58+
prompt="",
59+
source="",
60+
config=self.config
61+
)
62+
63+
# ************************************************
64+
# Define the graph nodes
65+
# ************************************************
66+
4967
search_internet_node = SearchInternetNode(
5068
input="user_prompt",
51-
output=["url"],
52-
node_config={
53-
"llm_model": self.llm_model
54-
}
55-
)
56-
fetch_node = FetchNode(
57-
input="url | local_dir",
58-
output=["doc"]
59-
)
60-
parse_node = ParseNode(
61-
input="doc",
62-
output=["parsed_doc"],
69+
output=["urls"],
6370
node_config={
64-
"chunk_size": self.model_token
71+
"llm_model": self.llm_model,
72+
"max_results": self.max_results
6573
}
6674
)
67-
rag_node = RAGNode(
68-
input="user_prompt & (parsed_doc | doc)",
69-
output=["relevant_chunks"],
75+
graph_iterator_node = GraphIteratorNode(
76+
input="user_prompt & urls",
77+
output=["results"],
7078
node_config={
71-
"llm_model": self.llm_model,
72-
"embedder_model": self.embedder_model
79+
"graph_instance": smart_scraper_instance,
7380
}
7481
)
75-
generate_answer_node = GenerateAnswerNode(
76-
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
82+
83+
merge_answers_node = MergeAnswersNode(
84+
input="user_prompt & results",
7785
output=["answer"],
7886
node_config={
79-
"llm_model": self.llm_model
87+
"llm_model": self.llm_model,
8088
}
8189
)
8290

8391
return BaseGraph(
8492
nodes=[
8593
search_internet_node,
86-
fetch_node,
87-
parse_node,
88-
rag_node,
89-
generate_answer_node,
94+
graph_iterator_node,
95+
merge_answers_node
9096
],
9197
edges=[
92-
(search_internet_node, fetch_node),
93-
(fetch_node, parse_node),
94-
(parse_node, rag_node),
95-
(rag_node, generate_answer_node)
98+
(search_internet_node, graph_iterator_node),
99+
(graph_iterator_node, merge_answers_node)
96100
],
97101
entry_point=search_internet_node
98102
)

scrapegraphai/nodes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,5 @@
1717
from .robots_node import RobotsNode
1818
from .generate_answer_csv_node import GenerateAnswerCSVNode
1919
from .generate_answer_pdf_node import GenerateAnswerPDFNode
20+
from .graph_iterator_node import GraphIteratorNode
21+
from .merge_answers_node import MergeAnswersNode

scrapegraphai/nodes/generate_answer_csv_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Module for generating the answer node
33
"""
44
# Imports from standard library
5-
from typing import List
5+
from typing import List, Optional
66
from tqdm import tqdm
77

88
# Imports from Langchain
@@ -39,7 +39,7 @@ class GenerateAnswerCSVNode(BaseNode):
3939
updating the state with the generated answer under the 'answer' key.
4040
"""
4141

42-
def __init__(self, input: str, output: List[str], node_config: dict,
42+
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None,
4343
node_name: str = "GenerateAnswer"):
4444
"""
4545
Initializes the GenerateAnswerCSVNode with a language model client and a node name.

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44

55
# Imports from standard library
6-
from typing import List
6+
from typing import List, Optional
77
from tqdm import tqdm
88

99
# Imports from Langchain
@@ -33,7 +33,7 @@ class GenerateAnswerNode(BaseNode):
3333
node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer".
3434
"""
3535

36-
def __init__(self, input: str, output: List[str], node_config: dict,
36+
def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None,
3737
node_name: str = "GenerateAnswer"):
3838
super().__init__(node_name, "node", input, output, 2, node_config)
3939

0 commit comments

Comments
 (0)