Pre/beta #819

Merged (15 commits, Nov 22, 2024)
Changes from all commits
34 changes: 34 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,40 @@
## [1.31.1-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.3...v1.31.1-beta.4) (2024-11-21)


### Bug Fixes

* add new model instance ([2f3cafe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2f3cafeab0bce38571fa10d71f454b2a31766ddc))

## [1.31.1-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.2...v1.31.1-beta.3) (2024-11-21)


### Bug Fixes

* fetch node regex ([e2af232](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e2af2326f6c56e2abcc7dd5de9acdfb710507e0a))

## [1.31.1-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.1...v1.31.1-beta.2) (2024-11-20)


### Bug Fixes

* generate answer node timeout ([32ef554](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/32ef5547f1d864c750cd47c115be6f38a1931d2c))

## [1.31.1-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.0...v1.31.1-beta.1) (2024-11-20)


### Bug Fixes

* timeout ([c243106](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c243106552cec3b1df254c0d0a45401eb2f5c89d))


### CI

* **release:** 1.31.0-beta.1 [skip ci] ([1df7eb0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1df7eb0bcd923bc62fd19dddc0ce9b757e9742cf)), closes [#805](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/805)

## [1.31.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.30.0...v1.31.0) (2024-11-19)


### Features

* refactoring of generate answer node ([1f465e6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1f465e636d2869e4e36555124767de026d3a66ae))
4 changes: 1 addition & 3 deletions pyproject.toml
@@ -3,9 +3,7 @@ name = "scrapegraphai"



version = "1.31.0"


version = "1.31.1b4"



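The changelog calls this release 1.31.1-beta.4 while pyproject.toml says 1.31.1b4; these name the same version, since PEP 440 normalizes the "-beta.N" spelling to "bN". A quick check (a small sketch assuming the packaging library, not part of this PR):

from packaging.version import Version

# PEP 440 treats "-beta.4" as an alternate spelling of the "b4" pre-release segment.
assert Version("1.31.1-beta.4") == Version("1.31.1b4")
assert Version("1.31.1b4") < Version("1.31.1")  # pre-releases sort before the final release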
12 changes: 11 additions & 1 deletion requirements-dev.lock
@@ -30,6 +30,8 @@ anyio==4.4.0
astroid==3.2.4
# via pylint
async-timeout==4.0.3
+# via aiohttp
+# via langchain
# via scrapegraphai
attrs==24.2.0
# via aiohttp
@@ -78,6 +80,9 @@ distro==1.9.0
# via openai
docutils==0.19
# via sphinx
+exceptiongroup==1.2.2
+# via anyio
+# via pytest
fastapi==0.112.0
# via burr
fastapi-pagination==0.12.26
@@ -131,7 +136,6 @@ graphviz==0.20.3
# via burr
greenlet==3.0.3
# via playwright
-# via sqlalchemy
grpcio==1.65.4
# via google-api-core
# via grpcio-status
@@ -500,6 +504,9 @@ tokenizers==0.19.1
# via transformers
toml==0.10.2
# via streamlit
+tomli==2.1.0
+# via pylint
+# via pytest
tomlkit==0.13.0
# via pylint
tornado==6.4.1
@@ -517,6 +524,8 @@ transformers==4.44.2
# via scrapegraphai
typing-extensions==4.12.2
# via altair
+# via anyio
+# via astroid
# via fastapi
# via fastapi-pagination
# via google-generativeai
@@ -531,6 +540,7 @@ typing-extensions==4.12.2
# via sqlalchemy
# via streamlit
# via typing-inspect
+# via uvicorn
typing-inspect==0.9.0
# via dataclasses-json
# via sf-hamilton
6 changes: 5 additions & 1 deletion requirements.lock
@@ -19,6 +19,8 @@ anyio==4.4.0
# via httpx
# via openai
async-timeout==4.0.3
+# via aiohttp
+# via langchain
# via scrapegraphai
attrs==23.2.0
# via aiohttp
@@ -48,6 +50,8 @@ dill==0.3.8
# via multiprocess
distro==1.9.0
# via openai
+exceptiongroup==1.2.2
+# via anyio
fastembed==0.3.6
# via scrapegraphai
filelock==3.15.4
@@ -87,7 +91,6 @@ googlesearch-python==1.2.5
# via scrapegraphai
greenlet==3.0.3
# via playwright
-# via sqlalchemy
grpcio==1.65.1
# via google-api-core
# via grpcio-status
@@ -368,6 +371,7 @@ tqdm==4.66.4
transformers==4.44.2
# via scrapegraphai
typing-extensions==4.12.2
+# via anyio
# via google-generativeai
# via huggingface-hub
# via langchain-core
2 changes: 2 additions & 0 deletions scrapegraphai/helpers/models_tokens.py
@@ -161,13 +161,15 @@
"claude-3-sonnet-20240229": 200000,
"claude-3-haiku-20240307": 200000,
"claude-3-5-sonnet-20240620": 200000,
"claude-3-5-haiku-latest": 200000,
"claude-3-haiku-20240307": 4000,
},
"bedrock": {
"anthropic.claude-3-haiku-20240307-v1:0": 200000,
"anthropic.claude-3-sonnet-20240229-v1:0": 200000,
"anthropic.claude-3-opus-20240229-v1:0": 200000,
"anthropic.claude-3-5-sonnet-20240620-v1:0": 200000,
"claude-3-5-haiku-latest": 200000,
"anthropic.claude-v2:1": 200000,
"anthropic.claude-v2": 100000,
"anthropic.claude-instant-v1": 100000,
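models_tokens maps provider name to model name to context-window size in tokens, which is what the two added claude-3-5-haiku-latest entries register. A lookup sketch, assuming these entries sit under the "anthropic" provider key as the adjacent "bedrock" block suggests (the module path and dict name come from the diff; the usage around them is hypothetical):

from scrapegraphai.helpers.models_tokens import models_tokens

# Look up the context window for the newly added model.
limit = models_tokens["anthropic"].get("claude-3-5-haiku-latest")  # 200000

# Prefer .get() so an unknown model name yields None instead of a KeyError.
fallback = models_tokens["bedrock"].get("made-up-model", None)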
44 changes: 26 additions & 18 deletions scrapegraphai/nodes/fetch_node.py
@@ -80,28 +80,30 @@ def __init__(
            None if node_config is None else node_config.get("scrape_do", None)
        )

+    def is_valid_url(self, source: str) -> bool:
+        """
+        Validates if the source string is a valid URL using regex.
+
+        Parameters:
+            source (str): The URL string to validate
+
+        Raises:
+            ValueError: If the URL is invalid
+        """
+        import re
+        url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
+        if not bool(re.match(url_pattern, source)):
+            raise ValueError(f"Invalid URL format: {source}. URL must start with http(s):// and contain a valid domain.")
+        return True

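A quick standalone sketch of what this pattern accepts and rejects (illustrative only, not part of the diff):

import re

url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'

# Accepted: http(s) scheme followed by a host that starts with a normal character.
assert re.match(url_pattern, "https://example.com")
assert re.match(url_pattern, "http://example.com/page?q=1")

# Rejected: other schemes, scheme-less "www." forms, and embedded whitespace.
assert not re.match(url_pattern, "ftp://example.com")
assert not re.match(url_pattern, "www.example.com")
assert not re.match(url_pattern, "https://exa mple.com")

Note that the old startswith-based check below routed "www.example.com" to handle_web_source; under the new validation it raises ValueError instead.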
    def execute(self, state):
        """
        Executes the node's logic to fetch HTML content from a specified URL and
        update the state with this content.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the correct data types from the state.

        Returns:
            dict: The updated state with a new output key containing the fetched HTML content.

        Raises:
            KeyError: If the input key is not found in the state, indicating that the
                necessary information to perform the operation is missing.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)
        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]

        source = input_data[0]
@@ -124,10 +126,16 @@ def execute(self, state):
            return handlers[input_type](state, input_type, source)
        elif self.input == "pdf_dir":
            return state
-        elif not source.startswith("http") and not source.startswith("www"):
-            return self.handle_local_source(state, source)
-        else:
-            return self.handle_web_source(state, source)

+        # For web sources, validate URL before proceeding
+        try:
+            if self.is_valid_url(source):
+                return self.handle_web_source(state, source)
+        except ValueError as e:
+            # Re-raise the exception from is_valid_url
+            raise
+
+        return self.handle_local_source(state, source)

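One subtlety in the new flow: is_valid_url raises ValueError for anything that is not an http(s) URL, and the except block re-raises, so a plain local path can no longer reach the handle_local_source fallback on this code path. A variant that would preserve the local fallback (a hypothetical sketch reusing the same helpers, not code from this PR):

def route_source(node, state, source):
    # Treat validation failure as "not a URL" rather than a fatal error.
    try:
        if node.is_valid_url(source):
            return node.handle_web_source(state, source)
    except ValueError:
        pass  # Not a URL; fall through to local handling.
    return node.handle_local_source(state, source)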
    def handle_directory(self, state, input_type, source):
        """
48 changes: 23 additions & 25 deletions scrapegraphai/nodes/generate_answer_node.py
@@ -60,7 +60,22 @@ def __init__(
        self.script_creator = node_config.get("script_creator", False)
        self.is_md_scraper = node_config.get("is_md_scraper", False)
        self.additional_info = node_config.get("additional_info")
-        self.timeout = node_config.get("timeout", 30)
+        self.timeout = node_config.get("timeout", 120)

+    def invoke_with_timeout(self, chain, inputs, timeout):
+        """Helper method to invoke chain with timeout"""
+        try:
+            start_time = time.time()
+            response = chain.invoke(inputs)
+            if time.time() - start_time > timeout:
+                raise Timeout(f"Response took longer than {timeout} seconds")
+            return response
+        except Timeout as e:
+            self.logger.error(f"Timeout error: {str(e)}")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during chain execution: {str(e)}")
+            raise

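As written, the helper only checks the clock after chain.invoke returns, so a stalled request is not interrupted; the Timeout fires once the slow response finally arrives. If true preemption were needed, one option (a hypothetical sketch, not code from this PR) is to wait on a worker thread:

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FutureTimeout

def invoke_with_hard_timeout(chain, inputs, timeout):
    # Run the blocking call in a worker thread and give up waiting after `timeout`.
    executor = ThreadPoolExecutor(max_workers=1)
    future = executor.submit(chain.invoke, inputs)
    try:
        return future.result(timeout=timeout)
    except FutureTimeout:
        # The worker thread itself cannot be killed; it finishes in the background.
        raise TimeoutError(f"No response within {timeout} seconds")
    finally:
        executor.shutdown(wait=False)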
    def execute(self, state: dict) -> dict:
        """
@@ -116,39 +131,22 @@ def execute(self, state: dict) -> dict:
            template_chunks_prompt = self.additional_info + template_chunks_prompt
            template_merge_prompt = self.additional_info + template_merge_prompt

-        def invoke_with_timeout(chain, inputs, timeout):
-            try:
-                with get_openai_callback() as cb:
-                    start_time = time.time()
-                    response = chain.invoke(inputs)
-                    if time.time() - start_time > timeout:
-                        raise Timeout(f"Response took longer than {timeout} seconds")
-                    return response
-            except Timeout as e:
-                self.logger.error(f"Timeout error: {str(e)}")
-                raise
-            except Exception as e:
-                self.logger.error(f"Error during chain execution: {str(e)}")
-                raise

        if len(doc) == 1:
            prompt = PromptTemplate(
                template=template_no_chunks_prompt,
                input_variables=["question"],
                partial_variables={"context": doc, "format_instructions": format_instructions}
            )
            chain = prompt | self.llm_model
+            if output_parser:
+                chain = chain | output_parser

            try:
-                raw_response = invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
+                answer = self.invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
            except Timeout:
                state.update({self.output[0]: {"error": "Response timeout exceeded"}})
                return state

-            if output_parser:
-                chain = chain | output_parser
-
-            answer = chain.invoke({"question": user_prompt})
            state.update({self.output[0]: answer})
            return state
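For reference, the single-document branch is plain LCEL composition: the prompt template is piped into the model, optionally into a parser, and invoked once. A self-contained sketch with a stand-in model so it runs without credentials (the stand-in and example strings are assumptions, not project code):

from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda

# Stand-in for self.llm_model; a real chat model would slot in the same way.
fake_llm = RunnableLambda(lambda pv: "answer based on: " + pv.to_string()[:60])

prompt = PromptTemplate(
    template="Context: {context}\nQuestion: {question}",
    input_variables=["question"],
    partial_variables={"context": "example page text"},
)
chain = prompt | fake_llm
print(chain.invoke({"question": "What is the page about?"}))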
@@ -168,9 +166,9 @@ def invoke_with_timeout(chain, inputs, timeout):

        async_runner = RunnableParallel(**chains_dict)
        try:
-            batch_results = invoke_with_timeout(
-                async_runner,
-                {"question": user_prompt},
+            batch_results = self.invoke_with_timeout(
+                async_runner,
+                {"question": user_prompt},
                self.timeout
            )
        except Timeout:
@@ -187,7 +185,7 @@ def invoke_with_timeout(chain, inputs, timeout):
        if output_parser:
            merge_chain = merge_chain | output_parser
        try:
-            answer = invoke_with_timeout(
+            answer = self.invoke_with_timeout(
                merge_chain,
                {"context": batch_results, "question": user_prompt},
                self.timeout
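For the multi-chunk path, each chunk's prompt-plus-model chain is stored in chains_dict, and RunnableParallel fans the same input out to every branch, returning a dict of per-chunk results that the merge chain then combines. A minimal runnable sketch with stand-in chains (hypothetical, mirroring the structure above rather than reproducing it):

from langchain_core.runnables import RunnableLambda, RunnableParallel

# Stand-ins for the per-chunk prompt | llm chains built in the loop above.
chains_dict = {
    "chunk1": RunnableLambda(lambda x: f"answer for chunk 1 to: {x['question']}"),
    "chunk2": RunnableLambda(lambda x: f"answer for chunk 2 to: {x['question']}"),
}

async_runner = RunnableParallel(**chains_dict)
batch_results = async_runner.invoke({"question": "What does the page say?"})
# batch_results == {"chunk1": "...", "chunk2": "..."}, which the merge chain receives as context.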