Skip to content

Commit 9b45ebc

Browse files
committed
modify fetch node with no cut mode
1 parent 228a1de commit 9b45ebc

File tree

3 files changed

+56
-4
lines changed

3 files changed

+56
-4
lines changed

examples/extras/no_cut.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
"""
Example: run SmartScraperGraph with the ``cut`` option disabled.

The graph configuration below sets ``"cut": False``, which is intended to
leave the fetched HTML unprocessed during the fetch phase. The script then
prints the extraction result as JSON, followed by the graph execution info.
"""

import json
import os

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        # Prefer a key supplied through the environment so no secret has to
        # be written into the example; the fallback is the placeholder value
        # this example shipped with, so behaviour is unchanged when the
        # variable is not set.
        "api_key": os.getenv("OPENAI_API_KEY", "s"),
        "model": "gpt-3.5-turbo",
    },
    # Disable the "cut" step for the fetch node.
    "cut": False,
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="Extract me the python code inside the page",
    source="https://www.exploit-db.com/exploits/51447",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ def _create_graph(self) -> BaseGraph:
6666
output=["doc", "link_urls", "img_urls"],
6767
node_config={
6868
"llm_model": self.llm_model,
69+
"force": self.config.get("force", False),
70+
"cut": self.config.get("cut", True),
6971
"loader_kwargs": self.config.get("loader_kwargs", {}),
7072
}
7173
)

scrapegraphai/nodes/fetch_node.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ def __init__(
7171
False if node_config is None else node_config.get("script_creator", False)
7272
)
7373

74+
self.cut = (
75+
False if node_config is None else node_config.get("cut", True)
76+
)
77+
7478
def execute(self, state):
7579
"""
7680
Executes the node's logic to fetch HTML content from a specified URL and
@@ -105,7 +109,7 @@ def execute(self, state):
105109
compressed_document = [
106110
source
107111
]
108-
112+
109113
state.update({self.output[0]: compressed_document})
110114
return state
111115
# handling pdf
@@ -165,10 +169,13 @@ def execute(self, state):
165169
if response.status_code == 200:
166170
if not response.text.strip():
167171
raise ValueError("No HTML body content found in the response.")
172+
173+
parsed_content = response
174+
175+
if not self.cut:
176+
parsed_content = cleanup_html(response, source)
168177

169-
parsed_content = cleanup_html(response, source)
170-
171-
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not:
178+
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
172179
parsed_content = convert_to_md(source)
173180
compressed_document = [Document(page_content=parsed_content)]
174181
else:

0 commit comments

Comments
 (0)