Skip to content

Commit fc58e2d

Browse files
committed
feat(smart-scraper-multi): add schema to graphs and create SmartScraperMultiGraph
1 parent 5701afe commit fc58e2d

35 files changed

+402
-173
lines changed
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
"""
Example of a custom graph for creating a knowledge graph.

Scrapes several job boards in parallel with a GraphIteratorNode, merges the
per-site answers into one structured result, and finally builds a knowledge
graph from the merged answer.
"""

import json
import os

from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

from scrapegraphai.models import OpenAI
from scrapegraphai.graphs import BaseGraph, SmartScraperGraph
from scrapegraphai.nodes import GraphIteratorNode, MergeAnswersNode, KnowledgeGraphNode

load_dotenv()

# ************************************************
# Define the output schema
# ************************************************

schema = """{
  "Job Postings": {
    "Company x": [
      {
        "title": "...",
        "description": "...",
        "location": "...",
        "date_posted": "..",
        "requirements": ["...", "...", "..."]
      },
      {
        "title": "...",
        "description": "...",
        "location": "...",
        "date_posted": "..",
        "requirements": ["...", "...", "..."]
      }
    ],
    "Company y": [
      {
        "title": "...",
        "description": "...",
        "location": "...",
        "date_posted": "..",
        "requirements": ["...", "...", "..."]
      }
    ]
  }
}"""

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-4o",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Define the models and the sub-graph instance
# ************************************************

llm_model = OpenAI(graph_config["llm"])
# NOTE(review): `embedder` is instantiated but not wired into any node below —
# confirm whether it is still needed or can be removed.
embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key)

# prompt/source are left empty: GraphIteratorNode supplies them per URL at runtime
smart_scraper_instance = SmartScraperGraph(
    prompt="",
    source="",
    config=graph_config,
)

# ************************************************
# Define the graph nodes
# ************************************************

graph_iterator_node = GraphIteratorNode(
    input="user_prompt & urls",
    output=["results"],
    node_config={
        "graph_instance": smart_scraper_instance,
    }
)

merge_answers_node = MergeAnswersNode(
    input="user_prompt & results",
    output=["answer"],
    node_config={
        "llm_model": llm_model,
        "schema": schema
    }
)

knowledge_graph_node = KnowledgeGraphNode(
    input="user_prompt & answer",
    output=["kg"],
    node_config={
        "llm_model": llm_model,
    }
)

graph = BaseGraph(
    nodes=[
        graph_iterator_node,
        merge_answers_node,
        knowledge_graph_node
    ],
    edges=[
        (graph_iterator_node, merge_answers_node),
        (merge_answers_node, knowledge_graph_node)
    ],
    entry_point=graph_iterator_node
)

# ************************************************
# Execute the graph
# ************************************************

result, execution_info = graph.execute({
    "user_prompt": "List me all the Machine Learning Engineer job postings",
    "urls": [
        "https://www.linkedin.com/jobs/machine-learning-engineer-offerte-di-lavoro/?currentJobId=3889037104&originalSubdomain=it",
        "https://www.glassdoor.com/Job/italy-machine-learning-engineer-jobs-SRCH_IL.0,5_IN120_KO6,31.html",
        "https://it.indeed.com/jobs?q=ML+engineer&vjk=3c2e6d27601ffaaa"
    ],
})

# get the answer from the result
result = result.get("answer", "No answer found.")
print(json.dumps(result, indent=4))

examples/openai/custom_graph_openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646

4747
fetch_node = FetchNode(
4848
input="url | local_dir",
49-
output=["doc"],
49+
output=["doc", "link_urls", "img_urls"],
5050
node_config={
5151
"verbose": True,
5252
"headless": True,

examples/openai/multiple_search_openai.py

Lines changed: 0 additions & 79 deletions
This file was deleted.

examples/openai/omni_scraper_openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
graph_config = {
2020
"llm": {
2121
"api_key": openai_key,
22-
"model": "gpt-4-turbo",
22+
"model": "gpt-4o",
2323
},
2424
"verbose": True,
2525
"headless": True,

examples/openai/omni_search_graph_openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
"model": "gpt-4o",
2121
},
2222
"max_results": 2,
23-
"max_images": 5,
23+
"max_images": 1,
2424
"verbose": True,
2525
}
2626

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
"""
Basic example of a scraping pipeline using SmartScraperMultiGraph.

Runs the same prompt against several source URLs and returns a single
merged answer.
"""

import json
import os

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperMultiGraph

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-4o",
    },
    "verbose": True,
    "headless": False,
}

# *******************************************************
# Create the SmartScraperMultiGraph instance and run it
# *******************************************************

multiple_search_graph = SmartScraperMultiGraph(
    prompt="Who is Marco Perini?",
    source=[
        "https://perinim.github.io/",
        "https://perinim.github.io/cv/"
    ],
    # no output schema enforced for this example
    schema=None,
    config=graph_config
)

result = multiple_search_graph.run()
print(json.dumps(result, indent=4))
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
"""
Basic example of a scraping pipeline using SmartScraperGraph with a
user-defined output schema.
"""

import json
import os

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()

# ************************************************
# Define the output schema for the graph
# ************************************************

schema = """
{
  "Projects": [
    "Project #":
    {
      "title": "...",
      "description": "...",
    },
    "Project #":
    {
      "title": "...",
      "description": "...",
    }
  ]
}
"""

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=schema,
    config=graph_config
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

requirements-dev.lock

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ certifi==2024.2.2
4545
# via requests
4646
charset-normalizer==3.3.2
4747
# via requests
48+
colorama==0.4.6
49+
# via ipython
50+
# via pytest
51+
# via tqdm
4852
dataclasses-json==0.6.6
4953
# via langchain
5054
# via langchain-community
@@ -100,6 +104,7 @@ graphviz==0.20.3
100104
# via scrapegraphai
101105
greenlet==3.0.3
102106
# via playwright
107+
# via sqlalchemy
103108
groq==0.5.0
104109
# via langchain-groq
105110
grpcio==1.63.0
@@ -212,8 +217,6 @@ pandas==2.2.2
212217
# via scrapegraphai
213218
parso==0.8.4
214219
# via jedi
215-
pexpect==4.9.0
216-
# via ipython
217220
playwright==1.43.0
218221
# via scrapegraphai
219222
pluggy==1.5.0
@@ -230,8 +233,6 @@ protobuf==4.25.3
230233
# via googleapis-common-protos
231234
# via grpcio-status
232235
# via proto-plus
233-
ptyprocess==0.7.0
234-
# via pexpect
235236
pure-eval==0.2.2
236237
# via stack-data
237238
pyasn1==0.6.0

0 commit comments

Comments
 (0)