File tree Expand file tree Collapse file tree 3 files changed +56
-4
lines changed Expand file tree Collapse file tree 3 files changed +56
-4
lines changed Original file line number Diff line number Diff line change
1
+ """
2
+ This example shows how to skip processing of the HTML code in the fetch phase
3
+ """
4
+
5
+ import os , json
6
+ from scrapegraphai .graphs import SmartScraperGraph
7
+ from scrapegraphai .utils import prettify_exec_info
8
+
9
+
10
+ # ************************************************
11
+ # Define the configuration for the graph
12
+ # ************************************************
13
+
14
+
15
+ graph_config = {
16
+ "llm" : {
17
+ "api_key" : "s" ,
18
+ "model" : "gpt-3.5-turbo" ,
19
+ },
20
+ "cut" : False ,
21
+ "verbose" : True ,
22
+ "headless" : False ,
23
+ }
24
+
25
+ # ************************************************
26
+ # Create the SmartScraperGraph instance and run it
27
+ # ************************************************
28
+
29
+ smart_scraper_graph = SmartScraperGraph (
30
+ prompt = "Extract me the python code inside the page" ,
31
+ source = "https://www.exploit-db.com/exploits/51447" ,
32
+ config = graph_config
33
+ )
34
+
35
+ result = smart_scraper_graph .run ()
36
+ print (json .dumps (result , indent = 4 ))
37
+
38
+ # ************************************************
39
+ # Get graph execution info
40
+ # ************************************************
41
+
42
+ graph_exec_info = smart_scraper_graph .get_execution_info ()
43
+ print (prettify_exec_info (graph_exec_info ))
Original file line number Diff line number Diff line change @@ -66,6 +66,8 @@ def _create_graph(self) -> BaseGraph:
66
66
output = ["doc" , "link_urls" , "img_urls" ],
67
67
node_config = {
68
68
"llm_model" : self .llm_model ,
69
+ "force" : self .config .get ("force" , False ),
70
+ "cut" : self .config .get ("cut" , True ),
69
71
"loader_kwargs" : self .config .get ("loader_kwargs" , {}),
70
72
}
71
73
)
Original file line number Diff line number Diff line change @@ -71,6 +71,10 @@ def __init__(
71
71
False if node_config is None else node_config .get ("script_creator" , False )
72
72
)
73
73
74
+ self .cut = (
75
+ False if node_config is None else node_config .get ("cut" , True )
76
+ )
77
+
74
78
def execute (self , state ):
75
79
"""
76
80
Executes the node's logic to fetch HTML content from a specified URL and
@@ -105,7 +109,7 @@ def execute(self, state):
105
109
compressed_document = [
106
110
source
107
111
]
108
-
112
+
109
113
state .update ({self .output [0 ]: compressed_document })
110
114
return state
111
115
# handling pdf
@@ -165,10 +169,13 @@ def execute(self, state):
165
169
if response .status_code == 200 :
166
170
if not response .text .strip ():
167
171
raise ValueError ("No HTML body content found in the response." )
172
+
173
+ parsed_content = response
174
+
175
+ if not self .cut :
176
+ parsed_content = cleanup_html (response , source )
168
177
169
- parsed_content = cleanup_html (response , source )
170
-
171
- if isinstance (self .llm_model , OpenAI ) and not self .script_creator or self .force and not self .script_creator and not :
178
+ if (isinstance (self .llm_model , OpenAI ) and not self .script_creator ) or (self .force and not self .script_creator ):
172
179
parsed_content = convert_to_md (source )
173
180
compressed_document = [Document (page_content = parsed_content )]
174
181
else :
You can’t perform that action at this time.
0 commit comments