Skip to content

Commit 3914a5f

Browse files
authored
Merge pull request #819 from ScrapeGraphAI/pre/beta
2 parents 7c20fbe + b2720a4 commit 3914a5f

File tree

7 files changed

+102
-48
lines changed

7 files changed

+102
-48
lines changed

CHANGELOG.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,40 @@
1+
## [1.31.1-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.3...v1.31.1-beta.4) (2024-11-21)
2+
3+
4+
### Bug Fixes
5+
6+
* add new model instance ([2f3cafe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2f3cafeab0bce38571fa10d71f454b2a31766ddc))
7+
8+
## [1.31.1-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.2...v1.31.1-beta.3) (2024-11-21)
9+
10+
11+
### Bug Fixes
12+
13+
* fetch node regex ([e2af232](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e2af2326f6c56e2abcc7dd5de9acdfb710507e0a))
14+
15+
## [1.31.1-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.1...v1.31.1-beta.2) (2024-11-20)
16+
17+
18+
### Bug Fixes
19+
20+
* generate answer node timeout ([32ef554](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/32ef5547f1d864c750cd47c115be6f38a1931d2c))
21+
22+
## [1.31.1-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.0...v1.31.1-beta.1) (2024-11-20)
23+
24+
25+
### Bug Fixes
26+
27+
* timeout ([c243106](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c243106552cec3b1df254c0d0a45401eb2f5c89d))
28+
29+
30+
### CI
31+
32+
* **release:** 1.31.0-beta.1 [skip ci] ([1df7eb0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1df7eb0bcd923bc62fd19dddc0ce9b757e9742cf)), closes [#805](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/805) [#805](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/805)
33+
134
## [1.31.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.30.0...v1.31.0) (2024-11-19)
235

336

37+
438
### Features
539

640
* refactoring of generate answer node ([1f465e6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1f465e636d2869e4e36555124767de026d3a66ae))

pyproject.toml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@ name = "scrapegraphai"
33

44

55

6-
version = "1.31.0"
7-
8-
6+
version = "1.31.1b4"
97

108

119

requirements-dev.lock

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ anyio==4.4.0
3030
astroid==3.2.4
3131
# via pylint
3232
async-timeout==4.0.3
33+
# via aiohttp
34+
# via langchain
3335
# via scrapegraphai
3436
attrs==24.2.0
3537
# via aiohttp
@@ -78,6 +80,9 @@ distro==1.9.0
7880
# via openai
7981
docutils==0.19
8082
# via sphinx
83+
exceptiongroup==1.2.2
84+
# via anyio
85+
# via pytest
8186
fastapi==0.112.0
8287
# via burr
8388
fastapi-pagination==0.12.26
@@ -131,7 +136,6 @@ graphviz==0.20.3
131136
# via burr
132137
greenlet==3.0.3
133138
# via playwright
134-
# via sqlalchemy
135139
grpcio==1.65.4
136140
# via google-api-core
137141
# via grpcio-status
@@ -500,6 +504,9 @@ tokenizers==0.19.1
500504
# via transformers
501505
toml==0.10.2
502506
# via streamlit
507+
tomli==2.1.0
508+
# via pylint
509+
# via pytest
503510
tomlkit==0.13.0
504511
# via pylint
505512
tornado==6.4.1
@@ -517,6 +524,8 @@ transformers==4.44.2
517524
# via scrapegraphai
518525
typing-extensions==4.12.2
519526
# via altair
527+
# via anyio
528+
# via astroid
520529
# via fastapi
521530
# via fastapi-pagination
522531
# via google-generativeai
@@ -531,6 +540,7 @@ typing-extensions==4.12.2
531540
# via sqlalchemy
532541
# via streamlit
533542
# via typing-inspect
543+
# via uvicorn
534544
typing-inspect==0.9.0
535545
# via dataclasses-json
536546
# via sf-hamilton

requirements.lock

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ anyio==4.4.0
1919
# via httpx
2020
# via openai
2121
async-timeout==4.0.3
22+
# via aiohttp
23+
# via langchain
2224
# via scrapegraphai
2325
attrs==23.2.0
2426
# via aiohttp
@@ -48,6 +50,8 @@ dill==0.3.8
4850
# via multiprocess
4951
distro==1.9.0
5052
# via openai
53+
exceptiongroup==1.2.2
54+
# via anyio
5155
fastembed==0.3.6
5256
# via scrapegraphai
5357
filelock==3.15.4
@@ -87,7 +91,6 @@ googlesearch-python==1.2.5
8791
# via scrapegraphai
8892
greenlet==3.0.3
8993
# via playwright
90-
# via sqlalchemy
9194
grpcio==1.65.1
9295
# via google-api-core
9396
# via grpcio-status
@@ -368,6 +371,7 @@ tqdm==4.66.4
368371
transformers==4.44.2
369372
# via scrapegraphai
370373
typing-extensions==4.12.2
374+
# via anyio
371375
# via google-generativeai
372376
# via huggingface-hub
373377
# via langchain-core

scrapegraphai/helpers/models_tokens.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,13 +161,15 @@
161161
"claude-3-sonnet-20240229": 200000,
162162
"claude-3-haiku-20240307": 200000,
163163
"claude-3-5-sonnet-20240620": 200000,
164+
"claude-3-5-haiku-latest": 200000,
164165
"claude-3-haiku-20240307": 4000,
165166
},
166167
"bedrock": {
167168
"anthropic.claude-3-haiku-20240307-v1:0": 200000,
168169
"anthropic.claude-3-sonnet-20240229-v1:0": 200000,
169170
"anthropic.claude-3-opus-20240229-v1:0": 200000,
170171
"anthropic.claude-3-5-sonnet-20240620-v1:0": 200000,
172+
"claude-3-5-haiku-latest": 200000,
171173
"anthropic.claude-v2:1": 200000,
172174
"anthropic.claude-v2": 100000,
173175
"anthropic.claude-instant-v1": 100000,

scrapegraphai/nodes/fetch_node.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -80,28 +80,30 @@ def __init__(
8080
None if node_config is None else node_config.get("scrape_do", None)
8181
)
8282

83+
def is_valid_url(self, source: str) -> bool:
84+
"""
85+
Validates if the source string is a valid URL using regex.
86+
87+
Parameters:
88+
source (str): The URL string to validate
89+
90+
Raises:
91+
ValueError: If the URL is invalid
92+
"""
93+
import re
94+
url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
95+
if not bool(re.match(url_pattern, source)):
96+
raise ValueError(f"Invalid URL format: {source}. URL must start with http(s):// and contain a valid domain.")
97+
return True
98+
8399
def execute(self, state):
84100
"""
85101
Executes the node's logic to fetch HTML content from a specified URL and
86102
update the state with this content.
87-
88-
Args:
89-
state (dict): The current state of the graph. The input keys will be used
90-
to fetch the correct data types from the state.
91-
92-
Returns:
93-
dict: The updated state with a new output key containing the fetched HTML content.
94-
95-
Raises:
96-
KeyError: If the input key is not found in the state, indicating that the
97-
necessary information to perform the operation is missing.
98103
"""
99-
100104
self.logger.info(f"--- Executing {self.node_name} Node ---")
101105

102-
# Interpret input keys based on the provided input expression
103106
input_keys = self.get_input_keys(state)
104-
# Fetching data from the state based on the input keys
105107
input_data = [state[key] for key in input_keys]
106108

107109
source = input_data[0]
@@ -124,10 +126,16 @@ def execute(self, state):
124126
return handlers[input_type](state, input_type, source)
125127
elif self.input == "pdf_dir":
126128
return state
127-
elif not source.startswith("http") and not source.startswith("www"):
128-
return self.handle_local_source(state, source)
129-
else:
130-
return self.handle_web_source(state, source)
129+
130+
# For web sources, validate URL before proceeding
131+
try:
132+
if self.is_valid_url(source):
133+
return self.handle_web_source(state, source)
134+
except ValueError as e:
135+
# Re-raise the exception from is_valid_url
136+
raise
137+
138+
return self.handle_local_source(state, source)
131139

132140
def handle_directory(self, state, input_type, source):
133141
"""

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,22 @@ def __init__(
6060
self.script_creator = node_config.get("script_creator", False)
6161
self.is_md_scraper = node_config.get("is_md_scraper", False)
6262
self.additional_info = node_config.get("additional_info")
63-
self.timeout = node_config.get("timeout", 30)
63+
self.timeout = node_config.get("timeout", 120)
64+
65+
def invoke_with_timeout(self, chain, inputs, timeout):
66+
"""Helper method to invoke chain with timeout"""
67+
try:
68+
start_time = time.time()
69+
response = chain.invoke(inputs)
70+
if time.time() - start_time > timeout:
71+
raise Timeout(f"Response took longer than {timeout} seconds")
72+
return response
73+
except Timeout as e:
74+
self.logger.error(f"Timeout error: {str(e)}")
75+
raise
76+
except Exception as e:
77+
self.logger.error(f"Error during chain execution: {str(e)}")
78+
raise
6479

6580
def execute(self, state: dict) -> dict:
6681
"""
@@ -116,39 +131,22 @@ def execute(self, state: dict) -> dict:
116131
template_chunks_prompt = self.additional_info + template_chunks_prompt
117132
template_merge_prompt = self.additional_info + template_merge_prompt
118133

119-
def invoke_with_timeout(chain, inputs, timeout):
120-
try:
121-
with get_openai_callback() as cb:
122-
start_time = time.time()
123-
response = chain.invoke(inputs)
124-
if time.time() - start_time > timeout:
125-
raise Timeout(f"Response took longer than {timeout} seconds")
126-
return response
127-
except Timeout as e:
128-
self.logger.error(f"Timeout error: {str(e)}")
129-
raise
130-
except Exception as e:
131-
self.logger.error(f"Error during chain execution: {str(e)}")
132-
raise
133-
134134
if len(doc) == 1:
135135
prompt = PromptTemplate(
136136
template=template_no_chunks_prompt,
137137
input_variables=["question"],
138138
partial_variables={"context": doc, "format_instructions": format_instructions}
139139
)
140140
chain = prompt | self.llm_model
141+
if output_parser:
142+
chain = chain | output_parser
141143

142144
try:
143-
raw_response = invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
145+
answer = self.invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
144146
except Timeout:
145147
state.update({self.output[0]: {"error": "Response timeout exceeded"}})
146148
return state
147149

148-
if output_parser:
149-
chain = chain | output_parser
150-
151-
answer = chain.invoke({"question": user_prompt})
152150
state.update({self.output[0]: answer})
153151
return state
154152

@@ -168,9 +166,9 @@ def invoke_with_timeout(chain, inputs, timeout):
168166

169167
async_runner = RunnableParallel(**chains_dict)
170168
try:
171-
batch_results = invoke_with_timeout(
172-
async_runner,
173-
{"question": user_prompt},
169+
batch_results = self.invoke_with_timeout(
170+
async_runner,
171+
{"question": user_prompt},
174172
self.timeout
175173
)
176174
except Timeout:
@@ -187,7 +185,7 @@ def invoke_with_timeout(chain, inputs, timeout):
187185
if output_parser:
188186
merge_chain = merge_chain | output_parser
189187
try:
190-
answer = invoke_with_timeout(
188+
answer = self.invoke_with_timeout(
191189
merge_chain,
192190
{"context": batch_results, "question": user_prompt},
193191
self.timeout

0 commit comments

Comments
 (0)