
Commit 02ec4c1

Merge pull request #704 from ScrapeGraphAI/refactoring-smart_scraper
feat: add html_mode to smart_scraper
2 parents e5ac020 + 1e4ee3a commit 02ec4c1

File tree: 2 files changed (+86, -20 lines)

examples/extras/html_mode.py

Lines changed: 49 additions & 0 deletions
"""
Basic example of a scraping pipeline using SmartScraper.

By default SmartScraper converts the fetched page to Markdown before
extraction. If you want to work on the original HTML instead, you have
to set html_mode in the config.
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "html_mode": True,
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
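For comparison, here is a minimal sketch of the default behaviour (Markdown conversion), assuming the same environment and config keys as the example above; html_mode is simply left unset:

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()

# Sketch only: same example as above with "html_mode" omitted, so the
# default pipeline converts the fetched page to Markdown and parses it
# into chunks before the LLM extracts the answer.
markdown_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    # "html_mode" not set -> Markdown conversion + ParseNode
    "verbose": True,
    "headless": False,
}

markdown_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=markdown_config,
)

print(json.dumps(markdown_graph.run(), indent=4))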

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 37 additions & 20 deletions
@@ -69,14 +69,7 @@ def _create_graph(self) -> BaseGraph:
                 "scrape_do": self.config.get("scrape_do")
             }
         )
-        parse_node = ParseNode(
-            input="doc",
-            output=["parsed_doc"],
-            node_config={
-                "llm_model": self.llm_model,
-                "chunk_size": self.model_token
-            }
-        )
+
 
         generate_answer_node = GenerateAnswerNode(
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
@@ -88,19 +81,43 @@ def _create_graph(self) -> BaseGraph:
             }
         )
 
+        if self.config.get("html_mode") is not True:
+
+            parse_node = ParseNode(
+                input="doc",
+                output=["parsed_doc"],
+                node_config={
+                    "llm_model": self.llm_model,
+                    "chunk_size": self.model_token
+                }
+            )
+
+            return BaseGraph(
+                nodes=[
+                    fetch_node,
+                    parse_node,
+                    generate_answer_node,
+                ],
+                edges=[
+                    (fetch_node, parse_node),
+                    (parse_node, generate_answer_node)
+                ],
+                entry_point=fetch_node,
+                graph_name=self.__class__.__name__
+            )
+
         return BaseGraph(
-            nodes=[
-                fetch_node,
-                parse_node,
-                generate_answer_node,
-            ],
-            edges=[
-                (fetch_node, parse_node),
-                (parse_node, generate_answer_node)
-            ],
-            entry_point=fetch_node,
-            graph_name=self.__class__.__name__
-        )
+            nodes=[
+                fetch_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, generate_answer_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
 
     def run(self) -> str:
         """
