Pre/beta #819

Merged (15 commits, Nov 22, 2024)
Changes from all commits
34 changes: 34 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,40 @@
## [1.31.1-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.3...v1.31.1-beta.4) (2024-11-21)


### Bug Fixes

* add new model instance ([2f3cafe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2f3cafeab0bce38571fa10d71f454b2a31766ddc))

## [1.31.1-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.2...v1.31.1-beta.3) (2024-11-21)


### Bug Fixes

* fetch node regex ([e2af232](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e2af2326f6c56e2abcc7dd5de9acdfb710507e0a))

## [1.31.1-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1-beta.1...v1.31.1-beta.2) (2024-11-20)


### Bug Fixes

* generate answer node timeout ([32ef554](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/32ef5547f1d864c750cd47c115be6f38a1931d2c))

## [1.31.1-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.0...v1.31.1-beta.1) (2024-11-20)


### Bug Fixes

* timeout ([c243106](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c243106552cec3b1df254c0d0a45401eb2f5c89d))


### CI

* **release:** 1.31.0-beta.1 [skip ci] ([1df7eb0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1df7eb0bcd923bc62fd19dddc0ce9b757e9742cf)), closes [#805](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/805)

## [1.31.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.30.0...v1.31.0) (2024-11-19)


### Features

* refactoring of generate answer node ([1f465e6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1f465e636d2869e4e36555124767de026d3a66ae))
4 changes: 1 addition & 3 deletions pyproject.toml
@@ -3,9 +3,7 @@ name = "scrapegraphai"



version = "1.31.0"


version = "1.31.1b4"



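The changelog calls this release 1.31.1-beta.4 while pyproject.toml says 1.31.1b4; these name the same version, since PEP 440 normalizes the "-beta.N" spelling to "bN". A quick check (a small sketch assuming the packaging library, not part of this PR):

from packaging.version import Version

# PEP 440 treats "-beta.4" as an alternate spelling of the "b4" pre-release segment.
assert Version("1.31.1-beta.4") == Version("1.31.1b4")
assert Version("1.31.1b4") < Version("1.31.1")  # pre-releases sort before the final release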
12 changes: 11 additions & 1 deletion requirements-dev.lock
@@ -30,6 +30,8 @@ anyio==4.4.0
astroid==3.2.4
# via pylint
async-timeout==4.0.3
+# via aiohttp
+# via langchain
# via scrapegraphai
attrs==24.2.0
# via aiohttp
@@ -78,6 +80,9 @@ distro==1.9.0
# via openai
docutils==0.19
# via sphinx
+exceptiongroup==1.2.2
+# via anyio
+# via pytest
fastapi==0.112.0
# via burr
fastapi-pagination==0.12.26
@@ -131,7 +136,6 @@ graphviz==0.20.3
# via burr
greenlet==3.0.3
# via playwright
-# via sqlalchemy
grpcio==1.65.4
# via google-api-core
# via grpcio-status
@@ -500,6 +504,9 @@ tokenizers==0.19.1
# via transformers
toml==0.10.2
# via streamlit
+tomli==2.1.0
+# via pylint
+# via pytest
tomlkit==0.13.0
# via pylint
tornado==6.4.1
@@ -517,6 +524,8 @@ transformers==4.44.2
# via scrapegraphai
typing-extensions==4.12.2
# via altair
+# via anyio
+# via astroid
# via fastapi
# via fastapi-pagination
# via google-generativeai
@@ -531,6 +540,7 @@ typing-extensions==4.12.2
# via sqlalchemy
# via streamlit
# via typing-inspect
+# via uvicorn
typing-inspect==0.9.0
# via dataclasses-json
# via sf-hamilton
6 changes: 5 additions & 1 deletion requirements.lock
@@ -19,6 +19,8 @@ anyio==4.4.0
# via httpx
# via openai
async-timeout==4.0.3
+# via aiohttp
+# via langchain
# via scrapegraphai
attrs==23.2.0
# via aiohttp
@@ -48,6 +50,8 @@ dill==0.3.8
# via multiprocess
distro==1.9.0
# via openai
+exceptiongroup==1.2.2
+# via anyio
fastembed==0.3.6
# via scrapegraphai
filelock==3.15.4
@@ -87,7 +91,6 @@ googlesearch-python==1.2.5
# via scrapegraphai
greenlet==3.0.3
# via playwright
-# via sqlalchemy
grpcio==1.65.1
# via google-api-core
# via grpcio-status
@@ -368,6 +371,7 @@ tqdm==4.66.4
transformers==4.44.2
# via scrapegraphai
typing-extensions==4.12.2
+# via anyio
# via google-generativeai
# via huggingface-hub
# via langchain-core
2 changes: 2 additions & 0 deletions scrapegraphai/helpers/models_tokens.py
@@ -161,13 +161,15 @@
"claude-3-sonnet-20240229": 200000,
"claude-3-haiku-20240307": 200000,
"claude-3-5-sonnet-20240620": 200000,
"claude-3-5-haiku-latest": 200000,
"claude-3-haiku-20240307": 4000,
},
"bedrock": {
"anthropic.claude-3-haiku-20240307-v1:0": 200000,
"anthropic.claude-3-sonnet-20240229-v1:0": 200000,
"anthropic.claude-3-opus-20240229-v1:0": 200000,
"anthropic.claude-3-5-sonnet-20240620-v1:0": 200000,
"claude-3-5-haiku-latest": 200000,
"anthropic.claude-v2:1": 200000,
"anthropic.claude-v2": 100000,
"anthropic.claude-instant-v1": 100000,
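models_tokens maps provider name to model name to context-window size in tokens, which is what the two added claude-3-5-haiku-latest entries register. A lookup sketch, assuming these entries sit under the "anthropic" provider key as the adjacent "bedrock" block suggests (the module path and dict name come from the diff; the usage around them is hypothetical):

from scrapegraphai.helpers.models_tokens import models_tokens

# Look up the context window for the newly added model.
limit = models_tokens["anthropic"].get("claude-3-5-haiku-latest")  # 200000

# Prefer .get() so an unknown model name yields None instead of a KeyError.
fallback = models_tokens["bedrock"].get("made-up-model", None)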
44 changes: 26 additions & 18 deletions scrapegraphai/nodes/fetch_node.py
@@ -80,28 +80,30 @@ def __init__(
            None if node_config is None else node_config.get("scrape_do", None)
        )

+    def is_valid_url(self, source: str) -> bool:
+        """
+        Validates if the source string is a valid URL using regex.
+
+        Parameters:
+            source (str): The URL string to validate
+
+        Raises:
+            ValueError: If the URL is invalid
+        """
+        import re
+        url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
+        if not bool(re.match(url_pattern, source)):
+            raise ValueError(f"Invalid URL format: {source}. URL must start with http(s):// and contain a valid domain.")
+        return True

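A quick standalone sketch of what this pattern accepts and rejects (illustrative only, not part of the diff):

import re

url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'

# Accepted: http(s) scheme followed by a host that starts with a normal character.
assert re.match(url_pattern, "https://example.com")
assert re.match(url_pattern, "http://example.com/page?q=1")

# Rejected: other schemes, scheme-less "www." forms, and embedded whitespace.
assert not re.match(url_pattern, "ftp://example.com")
assert not re.match(url_pattern, "www.example.com")
assert not re.match(url_pattern, "https://exa mple.com")

Note that the old startswith-based check below routed "www.example.com" to handle_web_source; under the new validation it raises ValueError instead.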
    def execute(self, state):
        """
        Executes the node's logic to fetch HTML content from a specified URL and
        update the state with this content.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the correct data types from the state.

        Returns:
            dict: The updated state with a new output key containing the fetched HTML content.

        Raises:
            KeyError: If the input key is not found in the state, indicating that the
                necessary information to perform the operation is missing.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)
        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]

        source = input_data[0]
@@ -124,10 +126,16 @@ def execute(self, state):
            return handlers[input_type](state, input_type, source)
        elif self.input == "pdf_dir":
            return state
-        elif not source.startswith("http") and not source.startswith("www"):
-            return self.handle_local_source(state, source)
-        else:
-            return self.handle_web_source(state, source)

+        # For web sources, validate URL before proceeding
+        try:
+            if self.is_valid_url(source):
+                return self.handle_web_source(state, source)
+        except ValueError as e:
+            # Re-raise the exception from is_valid_url
+            raise
+
+        return self.handle_local_source(state, source)

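One subtlety in the new flow: is_valid_url raises ValueError for anything that is not an http(s) URL, and the except block re-raises, so a plain local path can no longer reach the handle_local_source fallback on this code path. A variant that would preserve the local fallback (a hypothetical sketch reusing the same helpers, not code from this PR):

def route_source(node, state, source):
    # Treat validation failure as "not a URL" rather than a fatal error.
    try:
        if node.is_valid_url(source):
            return node.handle_web_source(state, source)
    except ValueError:
        pass  # Not a URL; fall through to local handling.
    return node.handle_local_source(state, source)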
    def handle_directory(self, state, input_type, source):
        """
48 changes: 23 additions & 25 deletions scrapegraphai/nodes/generate_answer_node.py
@@ -60,7 +60,22 @@ def __init__(
        self.script_creator = node_config.get("script_creator", False)
        self.is_md_scraper = node_config.get("is_md_scraper", False)
        self.additional_info = node_config.get("additional_info")
-        self.timeout = node_config.get("timeout", 30)
+        self.timeout = node_config.get("timeout", 120)

+    def invoke_with_timeout(self, chain, inputs, timeout):
+        """Helper method to invoke chain with timeout"""
+        try:
+            start_time = time.time()
+            response = chain.invoke(inputs)
+            if time.time() - start_time > timeout:
+                raise Timeout(f"Response took longer than {timeout} seconds")
+            return response
+        except Timeout as e:
+            self.logger.error(f"Timeout error: {str(e)}")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during chain execution: {str(e)}")
+            raise

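As written, the helper only checks the clock after chain.invoke returns, so a stalled request is not interrupted; the Timeout fires once the slow response finally arrives. If true preemption were needed, one option (a hypothetical sketch, not code from this PR) is to wait on a worker thread:

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FutureTimeout

def invoke_with_hard_timeout(chain, inputs, timeout):
    # Run the blocking call in a worker thread and give up waiting after `timeout`.
    executor = ThreadPoolExecutor(max_workers=1)
    future = executor.submit(chain.invoke, inputs)
    try:
        return future.result(timeout=timeout)
    except FutureTimeout:
        # The worker thread itself cannot be killed; it finishes in the background.
        raise TimeoutError(f"No response within {timeout} seconds")
    finally:
        executor.shutdown(wait=False)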
    def execute(self, state: dict) -> dict:
        """
@@ -116,39 +131,22 @@ def execute(self, state: dict) -> dict:
            template_chunks_prompt = self.additional_info + template_chunks_prompt
            template_merge_prompt = self.additional_info + template_merge_prompt

-        def invoke_with_timeout(chain, inputs, timeout):
-            try:
-                with get_openai_callback() as cb:
-                    start_time = time.time()
-                    response = chain.invoke(inputs)
-                    if time.time() - start_time > timeout:
-                        raise Timeout(f"Response took longer than {timeout} seconds")
-                    return response
-            except Timeout as e:
-                self.logger.error(f"Timeout error: {str(e)}")
-                raise
-            except Exception as e:
-                self.logger.error(f"Error during chain execution: {str(e)}")
-                raise

        if len(doc) == 1:
            prompt = PromptTemplate(
                template=template_no_chunks_prompt,
                input_variables=["question"],
                partial_variables={"context": doc, "format_instructions": format_instructions}
            )
            chain = prompt | self.llm_model
+            if output_parser:
+                chain = chain | output_parser

            try:
-                raw_response = invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
+                answer = self.invoke_with_timeout(chain, {"question": user_prompt}, self.timeout)
            except Timeout:
                state.update({self.output[0]: {"error": "Response timeout exceeded"}})
                return state

-            if output_parser:
-                chain = chain | output_parser
-
-            answer = chain.invoke({"question": user_prompt})
            state.update({self.output[0]: answer})
            return state
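For reference, the single-document branch is plain LCEL composition: the prompt template is piped into the model, optionally into a parser, and invoked once. A self-contained sketch with a stand-in model so it runs without credentials (the stand-in and example strings are assumptions, not project code):

from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda

# Stand-in for self.llm_model; a real chat model would slot in the same way.
fake_llm = RunnableLambda(lambda pv: "answer based on: " + pv.to_string()[:60])

prompt = PromptTemplate(
    template="Context: {context}\nQuestion: {question}",
    input_variables=["question"],
    partial_variables={"context": "example page text"},
)
chain = prompt | fake_llm
print(chain.invoke({"question": "What is the page about?"}))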
@@ -168,9 +166,9 @@ def invoke_with_timeout(chain, inputs, timeout):

        async_runner = RunnableParallel(**chains_dict)
        try:
-            batch_results = invoke_with_timeout(
-                async_runner,
-                {"question": user_prompt},
+            batch_results = self.invoke_with_timeout(
+                async_runner,
+                {"question": user_prompt},
                self.timeout
            )
        except Timeout:
@@ -187,7 +185,7 @@ def invoke_with_timeout(chain, inputs, timeout):
        if output_parser:
            merge_chain = merge_chain | output_parser
        try:
-            answer = invoke_with_timeout(
+            answer = self.invoke_with_timeout(
                merge_chain,
                {"context": batch_results, "question": user_prompt},
                self.timeout
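For the multi-chunk path, each chunk's prompt-plus-model chain is stored in chains_dict, and RunnableParallel fans the same input out to every branch, returning a dict of per-chunk results that the merge chain then combines. A minimal runnable sketch with stand-in chains (hypothetical, mirroring the structure above rather than reproducing it):

from langchain_core.runnables import RunnableLambda, RunnableParallel

# Stand-ins for the per-chunk prompt | llm chains built in the loop above.
chains_dict = {
    "chunk1": RunnableLambda(lambda x: f"answer for chunk 1 to: {x['question']}"),
    "chunk2": RunnableLambda(lambda x: f"answer for chunk 2 to: {x['question']}"),
}

async_runner = RunnableParallel(**chains_dict)
batch_results = async_runner.invoke({"question": "What does the page say?"})
# batch_results == {"chunk1": "...", "chunk2": "..."}, which the merge chain receives as context.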