Skip to content

Commit 9b78e2d

Browse files
committed
Merge branch 'pre/beta' of https://github.com/shenghongtw/Scrapegraph-ai into pre/beta
2 parents 464b8b0 + 9266a36 commit 9b78e2d

File tree

9 files changed

+145
-61
lines changed

9 files changed

+145
-61
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
## [1.27.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6-beta.1...v1.27.0-beta.1) (2024-10-16)
2+
3+
4+
### Features
5+
6+
* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3))
7+
18
## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14)
29

310

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
"""
Basic example of a scraping pipeline using SmartScraperGraph with Groq,
demonstrating the ``reattempt`` option: when enabled, the graph adds a
ConditionalNode that re-runs answer generation if the first answer comes
back empty or "NA".
"""

import os
import json

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph

# Load GROQ_APIKEY (and any other settings) from a local .env file.
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("GROQ_APIKEY"),
        "model": "groq/gemma-7b-it",
    },
    "verbose": True,
    "headless": True,
    # Setting this to True will allow the graph to reattempt the scraping
    # process when the first generated answer is empty or "NA".
    "reattempt": True,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="Who is Marco Perini?",
    source="https://perinim.github.io/",
    schema=None,
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
22
name = "scrapegraphai"
33

4-
version = "1.26.6b1"
4+
version = "1.27.0b1"
55

66
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
77
authors = [
@@ -38,7 +38,8 @@ dependencies = [
3838
"async-timeout>=4.0.3",
3939
"transformers>=4.44.2",
4040
"googlesearch-python>=1.2.5",
41-
"simpleeval>=1.0.0"
41+
"simpleeval>=1.0.0",
42+
"async_timeout>=4.0.3"
4243
]
4344

4445
license = "MIT"

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,5 @@ undetected-playwright>=0.3.0
1818
semchunk>=1.0.1
1919
langchain-ollama>=0.1.3
2020
simpleeval>=0.9.13
21-
googlesearch-python>=1.2.5
21+
googlesearch-python>=1.2.5
22+
async_timeout>=4.0.3

scrapegraphai/graphs/base_graph.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,10 @@ def _set_conditional_node_edges(self):
9595
raise ValueError(f"ConditionalNode '{node.node_name}' must have exactly two outgoing edges.")
9696
# Assign true_node_name and false_node_name
9797
node.true_node_name = outgoing_edges[0][1].node_name
98-
node.false_node_name = outgoing_edges[1][1].node_name
98+
try:
99+
node.false_node_name = outgoing_edges[1][1].node_name
100+
except:
101+
node.false_node_name = None
99102

100103
def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
101104
"""
@@ -221,6 +224,8 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
221224
node_names = {node.node_name for node in self.nodes}
222225
if result in node_names:
223226
current_node_name = result
227+
elif result is None:
228+
current_node_name = None
224229
else:
225230
raise ValueError(f"Conditional Node returned a node name '{result}' that does not exist in the graph")
226231

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 83 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,17 @@
22
SmartScraperGraph Module
33
"""
44
from typing import Optional
5-
import logging
65
from pydantic import BaseModel
76
from .base_graph import BaseGraph
87
from .abstract_graph import AbstractGraph
98
from ..nodes import (
109
FetchNode,
1110
ParseNode,
1211
ReasoningNode,
13-
GenerateAnswerNode
12+
GenerateAnswerNode,
13+
ConditionalNode
1414
)
15+
from ..prompts import REGEN_ADDITIONAL_INFO
1516

1617
class SmartScraperGraph(AbstractGraph):
1718
"""
@@ -89,6 +90,28 @@ def _create_graph(self) -> BaseGraph:
8990
}
9091
)
9192

93+
cond_node = None
94+
regen_node = None
95+
if self.config.get("reattempt") is True:
96+
cond_node = ConditionalNode(
97+
input="answer",
98+
output=["answer"],
99+
node_name="ConditionalNode",
100+
node_config={
101+
"key_name": "answer",
102+
"condition": 'not answer or answer=="NA"',
103+
}
104+
)
105+
regen_node = GenerateAnswerNode(
106+
input="user_prompt & answer",
107+
output=["answer"],
108+
node_config={
109+
"llm_model": self.llm_model,
110+
"additional_info": REGEN_ADDITIONAL_INFO,
111+
"schema": self.schema,
112+
}
113+
)
114+
92115
if self.config.get("html_mode") is False:
93116
parse_node = ParseNode(
94117
input="doc",
@@ -99,6 +122,7 @@ def _create_graph(self) -> BaseGraph:
99122
}
100123
)
101124

125+
reasoning_node = None
102126
if self.config.get("reasoning"):
103127
reasoning_node = ReasoningNode(
104128
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
@@ -109,68 +133,72 @@ def _create_graph(self) -> BaseGraph:
109133
"schema": self.schema,
110134
}
111135
)
136+
137+
# Define the graph variation configurations
138+
# (html_mode, reasoning, reattempt)
139+
graph_variation_config = {
140+
(False, True, False): {
141+
"nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node],
142+
"edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node)]
143+
},
144+
(True, True, False): {
145+
"nodes": [fetch_node, reasoning_node, generate_answer_node],
146+
"edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node)]
147+
},
148+
(True, False, False): {
149+
"nodes": [fetch_node, generate_answer_node],
150+
"edges": [(fetch_node, generate_answer_node)]
151+
},
152+
(False, False, False): {
153+
"nodes": [fetch_node, parse_node, generate_answer_node],
154+
"edges": [(fetch_node, parse_node), (parse_node, generate_answer_node)]
155+
},
156+
(False, True, True): {
157+
"nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node, cond_node, regen_node],
158+
"edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node),
159+
(generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)]
160+
},
161+
(True, True, True): {
162+
"nodes": [fetch_node, reasoning_node, generate_answer_node, cond_node, regen_node],
163+
"edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node),
164+
(generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)]
165+
},
166+
(True, False, True): {
167+
"nodes": [fetch_node, generate_answer_node, cond_node, regen_node],
168+
"edges": [(fetch_node, generate_answer_node), (generate_answer_node, cond_node),
169+
(cond_node, regen_node), (cond_node, None)]
170+
},
171+
(False, False, True): {
172+
"nodes": [fetch_node, parse_node, generate_answer_node, cond_node, regen_node],
173+
"edges": [(fetch_node, parse_node), (parse_node, generate_answer_node),
174+
(generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)]
175+
}
176+
}
112177

113-
if self.config.get("html_mode") is False and self.config.get("reasoning") is True:
114-
115-
return BaseGraph(
116-
nodes=[
117-
fetch_node,
118-
parse_node,
119-
reasoning_node,
120-
generate_answer_node,
121-
],
122-
edges=[
123-
(fetch_node, parse_node),
124-
(parse_node, reasoning_node),
125-
(reasoning_node, generate_answer_node)
126-
],
127-
entry_point=fetch_node,
128-
graph_name=self.__class__.__name__
129-
)
130-
131-
elif self.config.get("html_mode") is True and self.config.get("reasoning") is True:
178+
# Get the current conditions
179+
html_mode = self.config.get("html_mode", False)
180+
reasoning = self.config.get("reasoning", False)
181+
reattempt = self.config.get("reattempt", False)
132182

133-
return BaseGraph(
134-
nodes=[
135-
fetch_node,
136-
reasoning_node,
137-
generate_answer_node,
138-
],
139-
edges=[
140-
(fetch_node, reasoning_node),
141-
(reasoning_node, generate_answer_node)
142-
],
143-
entry_point=fetch_node,
144-
graph_name=self.__class__.__name__
145-
)
183+
# Retrieve the appropriate graph configuration
184+
config = graph_variation_config.get((html_mode, reasoning, reattempt))
146185

147-
elif self.config.get("html_mode") is True and self.config.get("reasoning") is False:
186+
if config:
148187
return BaseGraph(
149-
nodes=[
150-
fetch_node,
151-
generate_answer_node,
152-
],
153-
edges=[
154-
(fetch_node, generate_answer_node)
155-
],
188+
nodes=config["nodes"],
189+
edges=config["edges"],
156190
entry_point=fetch_node,
157191
graph_name=self.__class__.__name__
158192
)
159193

194+
# Default return if no conditions match
160195
return BaseGraph(
161-
nodes=[
162-
fetch_node,
163-
parse_node,
164-
generate_answer_node,
165-
],
166-
edges=[
167-
(fetch_node, parse_node),
168-
(parse_node, generate_answer_node)
169-
],
170-
entry_point=fetch_node,
171-
graph_name=self.__class__.__name__
172-
)
173-
196+
nodes=[fetch_node, parse_node, generate_answer_node],
197+
edges=[(fetch_node, parse_node), (parse_node, generate_answer_node)],
198+
entry_point=fetch_node,
199+
graph_name=self.__class__.__name__
200+
)
201+
174202
def run(self) -> str:
175203
"""
176204
Executes the scraping process and returns the answer to the prompt.

scrapegraphai/nodes/conditional_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def execute(self, state: dict) -> dict:
6161
str: The name of the next node to execute based on the presence of the key.
6262
"""
6363

64-
if self.true_node_name is None or self.false_node_name is None:
64+
if self.true_node_name is None:
6565
raise ValueError("ConditionalNode's next nodes are not set properly.")
6666

6767
if self.condition:

scrapegraphai/prompts/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from .generate_answer_node_prompts import (TEMPLATE_CHUNKS,
66
TEMPLATE_NO_CHUNKS,
77
TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD,
8-
TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD)
8+
TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD, REGEN_ADDITIONAL_INFO)
99
from .generate_answer_node_csv_prompts import (TEMPLATE_CHUKS_CSV,
1010
TEMPLATE_NO_CHUKS_CSV,
1111
TEMPLATE_MERGE_CSV)

scrapegraphai/prompts/generate_answer_node_prompts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,7 @@
8686
USER QUESTION: {question}\n
8787
WEBSITE CONTENT: {context}\n
8888
"""
89+
90+
REGEN_ADDITIONAL_INFO = """
91+
You are a scraper and you have just failed to scrape the requested information from a website. \n
92+
I want you to try again and provide the missing information. \n"""

0 commit comments

Comments
 (0)