Skip to content

Commit 99ad654

Browse files
authored
Merge branch 'pre/beta' into 713-pdf-scrapping
2 parents 26f89d8 + ac31d7f commit 99ad654

File tree

10 files changed

+379
-20
lines changed

10 files changed

+379
-20
lines changed

CHANGELOG.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,27 @@
1-
## [1.25.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.25.1) (2024-09-29)
1+
2+
## [1.26.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.26.0-beta.1) (2024-09-29)
3+
4+
5+
6+
* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
7+
* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24))
8+
29

310

411
### Bug Fixes
512

613
* removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
714

15+
* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
16+
* removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
17+
18+
19+
### CI
20+
21+
* **release:** 1.22.0-beta.4 [skip ci] ([4330179](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4330179cb65674d65423c1763f90182e85c15a74))
22+
* **release:** 1.22.0-beta.5 [skip ci] ([6d8f543](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6d8f5435d1ecd2d90b06aade50abc064f75c9d78))
23+
* **release:** 1.22.0-beta.6 [skip ci] ([39f7815](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/39f78154a6f1123fa8aca5e169c803111c175473))
24+
825
## [1.25.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.24.1...v1.25.0) (2024-09-27)
926

1027

@@ -15,11 +32,15 @@
1532
## [1.24.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.24.0...v1.24.1) (2024-09-26)
1633

1734

35+
1836
### Bug Fixes
1937

2038
* script creator multi ([9905be8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9905be8a37dc1ff4b90fe9b8be987887253be8bd))
2139

2240
## [1.24.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.23.1...v1.24.0) (2024-09-26)
41+
* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
42+
43+
## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27)
2344

2445

2546
### Features
@@ -44,6 +65,14 @@
4465
* **release:** 1.22.0-beta.1 [skip ci] ([f42a95f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f42a95faa05de39bd9cfc05e377d4b3da372e482))
4566
* **release:** 1.22.0-beta.2 [skip ci] ([431c09f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/431c09f551ac28581674c6061f055fde0350ed4c))
4667
* **release:** 1.22.0-beta.3 [skip ci] ([e5ac020](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e5ac0205d1e04a8b31e86166c3673915b70fd1e3))
68+
* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24))
69+
70+
## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27)
71+
72+
73+
### Features
74+
75+
* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
4776

4877
## [1.22.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.2...v1.22.0-beta.3) (2024-09-25)
4978

examples/extras/html_mode.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
"""
Basic example of a scraping pipeline using SmartScraperGraph.

By default, SmartScraperGraph converts the fetched page to Markdown
before extraction. If you want the pipeline to work on the original
HTML instead, set ``"html_mode": True`` in the graph configuration.
"""

import os
import json

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Load environment variables (e.g. OPENAI_API_KEY) from a .env file.
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "html_mode": True,  # skip the HTML -> Markdown conversion step
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

examples/extras/reasoning.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
"""
Basic example of scraping pipeline using SmartScraper
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Pull OPENAI_API_KEY (and friends) into the environment from .env.
load_dotenv()

# ------------------------------------------------
# Graph configuration
# ------------------------------------------------

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "reasoning": True,  # enable the reasoning/prompt-refinement step
    "verbose": True,
    "headless": False,
}

# ------------------------------------------------
# Build the SmartScraperGraph and execute it
# ------------------------------------------------

scraper = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
)

print(json.dumps(scraper.run(), indent=4))

# ------------------------------------------------
# Report graph execution statistics
# ------------------------------------------------

print(prettify_exec_info(scraper.get_execution_info()))

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
[project]
22
name = "scrapegraphai"
33

4-
version = "1.25.1"
5-
4+
version = "1.26.0b1"
65

76
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
87
authors = [

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 81 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from ..nodes import (
1010
FetchNode,
1111
ParseNode,
12+
ReasoningNode,
1213
GenerateAnswerNode
1314
)
1415

@@ -69,7 +70,6 @@ def _create_graph(self) -> BaseGraph:
6970
"scrape_do": self.config.get("scrape_do")
7071
}
7172
)
72-
7373
parse_node = ParseNode(
7474
input="doc",
7575
output=["parsed_doc"],
@@ -89,19 +89,87 @@ def _create_graph(self) -> BaseGraph:
8989
}
9090
)
9191

92+
if self.config.get("html_mode") is False:
93+
parse_node = ParseNode(
94+
input="doc",
95+
output=["parsed_doc"],
96+
node_config={
97+
"llm_model": self.llm_model,
98+
"chunk_size": self.model_token
99+
}
100+
)
101+
102+
if self.config.get("reasoning"):
103+
reasoning_node = ReasoningNode(
104+
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
105+
output=["answer"],
106+
node_config={
107+
"llm_model": self.llm_model,
108+
"additional_info": self.config.get("additional_info"),
109+
"schema": self.schema,
110+
}
111+
)
112+
113+
if self.config.get("html_mode") is False and self.config.get("reasoning") is True:
114+
115+
return BaseGraph(
116+
nodes=[
117+
fetch_node,
118+
parse_node,
119+
reasoning_node,
120+
generate_answer_node,
121+
],
122+
edges=[
123+
(fetch_node, parse_node),
124+
(parse_node, reasoning_node),
125+
(reasoning_node, generate_answer_node)
126+
],
127+
entry_point=fetch_node,
128+
graph_name=self.__class__.__name__
129+
)
130+
131+
elif self.config.get("html_mode") is True and self.config.get("reasoning") is True:
132+
133+
return BaseGraph(
134+
nodes=[
135+
fetch_node,
136+
reasoning_node,
137+
generate_answer_node,
138+
],
139+
edges=[
140+
(fetch_node, reasoning_node),
141+
(reasoning_node, generate_answer_node)
142+
],
143+
entry_point=fetch_node,
144+
graph_name=self.__class__.__name__
145+
)
146+
147+
elif self.config.get("html_mode") is True and self.config.get("reasoning") is False:
148+
return BaseGraph(
149+
nodes=[
150+
fetch_node,
151+
generate_answer_node,
152+
],
153+
edges=[
154+
(fetch_node, generate_answer_node)
155+
],
156+
entry_point=fetch_node,
157+
graph_name=self.__class__.__name__
158+
)
159+
92160
return BaseGraph(
93-
nodes=[
94-
fetch_node,
95-
parse_node,
96-
generate_answer_node,
97-
],
98-
edges=[
99-
(fetch_node, parse_node),
100-
(parse_node, generate_answer_node)
101-
],
102-
entry_point=fetch_node,
103-
graph_name=self.__class__.__name__
104-
)
161+
nodes=[
162+
fetch_node,
163+
parse_node,
164+
generate_answer_node,
165+
],
166+
edges=[
167+
(fetch_node, parse_node),
168+
(parse_node, generate_answer_node)
169+
],
170+
entry_point=fetch_node,
171+
graph_name=self.__class__.__name__
172+
)
105173

106174
def run(self) -> str:
107175
"""

scrapegraphai/nodes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@
2727
from .html_analyzer_node import HtmlAnalyzerNode
2828
from .generate_code_node import GenerateCodeNode
2929
from .search_node_with_context import SearchLinksWithContext
30+
from .reasoning_node import ReasoningNode

scrapegraphai/nodes/html_analyzer_node.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,7 @@
44
from typing import List, Optional
55
from langchain.prompts import PromptTemplate
66
from langchain_core.output_parsers import StrOutputParser
7-
from langchain_core.runnables import RunnableParallel
8-
from langchain_core.utils.pydantic import is_basemodel_subclass
97
from langchain_community.chat_models import ChatOllama
10-
from tqdm import tqdm
118
from .base_node import BaseNode
129
from ..utils import reduce_html
1310
from ..prompts import (

scrapegraphai/nodes/reasoning_node.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
"""
ReasoningNode Module
"""
from typing import List, Optional
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from .base_node import BaseNode
from ..utils import transform_schema
from ..prompts import (
    TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT
)

class ReasoningNode(BaseNode):
    """
    A node that refines the user prompt with the use of the schema and additional
    context, creating a precise prompt that explicitly links elements in the
    user's original input to their corresponding representations in the
    JSON schema.

    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "PromptRefiner".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "PromptRefiner",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # node_config is effectively required: this lookup raises if it is
        # missing, so the defaults below never see a None config.
        self.llm_model = node_config["llm_model"]

        # Ollama models need the output format pinned to JSON explicitly.
        if isinstance(node_config["llm_model"], ChatOllama):
            self.llm_model.format = "json"

        # Default both flags to False, consistent with the other nodes in
        # this package (the original "True if node_config is None" branch
        # was unreachable and contradicted the `force` default).
        self.verbose = node_config.get("verbose", False)
        self.force = node_config.get("force", False)

        self.additional_info = node_config.get("additional_info", None)

        # Pydantic model describing the desired output. execute() assumes
        # it is not None (it calls .schema() on it) — confirm callers always
        # pass a schema.
        self.output_schema = node_config.get("schema")

    def execute(self, state: dict) -> dict:
        """
        Generate a refined prompt for the reasoning task based
        on the user's input and the JSON schema.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                            to fetch the correct data from the state.

        Returns:
            dict: The updated state with the output key containing the generated answer.

        Raises:
            KeyError: If the input keys are not found in the state, indicating
                      that the necessary information for generating an answer is missing.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        user_prompt = state['user_prompt']

        # Flatten the pydantic schema into a simplified representation that
        # the prompt template can embed directly.
        simplified_schema = transform_schema(self.output_schema.schema())

        if self.additional_info is not None:
            prompt = PromptTemplate(
                template=TEMPLATE_REASONING_WITH_CONTEXT,
                partial_variables={"user_input": user_prompt,
                                   "json_schema": str(simplified_schema),
                                   "additional_context": self.additional_info})
        else:
            prompt = PromptTemplate(
                template=TEMPLATE_REASONING,
                partial_variables={"user_input": user_prompt,
                                   "json_schema": str(simplified_schema)})

        output_parser = StrOutputParser()

        # All template variables are pre-bound via partial_variables, so the
        # chain is invoked with an empty input mapping.
        chain = prompt | self.llm_model | output_parser
        refined_prompt = chain.invoke({})

        state.update({self.output[0]: refined_prompt})
        return state

0 commit comments

Comments
 (0)