Commit bdcffd6

feat: add html_mode to smart_scraper
1 parent e5ac020 commit bdcffd6

File tree

2 files changed (+85 / -20 lines)

examples/extras/html_mode.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
"""
Basic example of a scraping pipeline using SmartScraper.
By default, SmartScraper converts the fetched content into Markdown;
here html_mode is enabled to skip that conversion and work on the raw HTML.
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "html_mode": True,
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
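
For comparison, here is a minimal sketch of the same pipeline with html_mode simply omitted (its default). Per the smart_scraper_graph.py diff below, the graph then routes the fetched document through a ParseNode before answer generation; the env var, model name, and prompt are copied from the example above rather than mandated by this commit.

```python
import os
import json

from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()

# Same pipeline, but without "html_mode": the default path, in which the
# fetched document is parsed/chunked by a ParseNode before the LLM answers.
default_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),  # same env var as the example above
        "model": "openai/gpt-4o",
    },
    "verbose": True,
    "headless": False,
}

graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=default_config,
)

print(json.dumps(graph.run(), indent=4))
```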

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 37 additions & 20 deletions
@@ -69,14 +69,7 @@ def _create_graph(self) -> BaseGraph:
                 "scrape_do": self.config.get("scrape_do")
             }
         )
-        parse_node = ParseNode(
-            input="doc",
-            output=["parsed_doc"],
-            node_config={
-                "llm_model": self.llm_model,
-                "chunk_size": self.model_token
-            }
-        )
+
 
         generate_answer_node = GenerateAnswerNode(
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
@@ -88,19 +81,43 @@
             }
         )
 
+        if self.config.get("html_mode") is not True:
+
+            parse_node = ParseNode(
+                input="doc",
+                output=["parsed_doc"],
+                node_config={
+                    "llm_model": self.llm_model,
+                    "chunk_size": self.model_token
+                }
+            )
+
+            return BaseGraph(
+                nodes=[
+                    fetch_node,
+                    parse_node,
+                    generate_answer_node,
+                ],
+                edges=[
+                    (fetch_node, parse_node),
+                    (parse_node, generate_answer_node)
+                ],
+                entry_point=fetch_node,
+                graph_name=self.__class__.__name__
+            )
+
         return BaseGraph(
-            nodes=[
-                fetch_node,
-                parse_node,
-                generate_answer_node,
-            ],
-            edges=[
-                (fetch_node, parse_node),
-                (parse_node, generate_answer_node)
-            ],
-            entry_point=fetch_node,
-            graph_name=self.__class__.__name__
-        )
+            nodes=[
+                fetch_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, generate_answer_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
 
     def run(self) -> str:
         """
