refactor: clean up imports, wrap long lines, and address lint warnings across nodes and utils

VinciGit00 · VinciGit00 · commit 9355507a2dc7 · 2024-08-02T12:00:00.000+02:00
diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py
@@ -86,7 +86,8 @@ def update_config(self, params: dict, overwrite: bool = False):
 
         Args:
             param (dict): The dictionary to update node_config with.
-            overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None.
+            overwrite (bool): Flag indicating if the values of node_config 
+            should be overwritten if their value is not None.
         """
         for key, val in params.items():
             if hasattr(self, key) and not overwrite:
@@ -133,7 +134,8 @@ def _validate_input_keys(self, input_keys):
 
     def _parse_input_keys(self, state: dict, expression: str) -> List[str]:
         """
-        Parses the input keys expression to extract relevant keys from the state based on logical conditions.
+        Parses the input keys expression to extract 
+        relevant keys from the state based on logical conditions.
         The expression can contain AND (&), OR (|), and parentheses to group conditions.
 
         Args:
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
@@ -133,7 +133,7 @@ def execute(self, state):
             state.update({self.output[0]: compressed_document})
             return state
         elif input_keys[0] == "json":
-            f = open(source)
+            f = open(source, encoding="utf-8")
             compressed_document = [
                 Document(page_content=str(json.load(f)), metadata={"source": "json"})
             ]
@@ -181,12 +181,11 @@ def execute(self, state):
                 if not response.text.strip():
                     raise ValueError("No HTML body content found in the response.")
 
-                parsed_content = response
-   
                 if not self.cut:
                     parsed_content = cleanup_html(response, source)
 
-                if  (isinstance(self.llm_model, ChatOpenAI) and not self.script_creator) or (self.force and not self.script_creator):
+                if  (isinstance(self.llm_model, ChatOpenAI)
+                     and not self.script_creator) or (self.force and not self.script_creator):
                     parsed_content = convert_to_md(source, input_data[0])
                 compressed_document = [Document(page_content=parsed_content)]
             else:
@@ -205,7 +204,8 @@ def execute(self, state):
                 data =  browser_base_fetch(self.browser_base.get("api_key"),
                                             self.browser_base.get("project_id"), [source])
 
-                document = [Document(page_content=content, metadata={"source": source}) for content in data]
+                document = [Document(page_content=content,
+                                    metadata={"source": source}) for content in data]
             else:
                 loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
                 document = loader.load()
@@ -215,10 +215,8 @@ def execute(self, state):
             parsed_content = document[0].page_content
 
             if  isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
-
                 parsed_content = convert_to_md(document[0].page_content, input_data[0])
 
-
             compressed_document = [
                 Document(page_content=parsed_content, metadata={"source": "html file"})
             ]
diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py
@@ -3,18 +3,12 @@
 Module for generating the answer node
 """
 
-# Imports from standard library
 from typing import List, Optional
-
-# Imports from Langchain
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from tqdm import tqdm
-
 from ..utils.logging import get_logger
-
-# Imports from the library
 from .base_node import BaseNode
 from ..helpers.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv
 
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
@@ -1,15 +1,13 @@
 """
 GenerateAnswerNode Module
 """
-import asyncio
 from typing import List, Optional
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from langchain_openai import ChatOpenAI
 from langchain_community.chat_models import ChatOllama
 from tqdm import tqdm
-from langchain_openai import ChatOpenAI
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 from ..helpers import template_chunks, template_no_chunks, template_merge, template_chunks_md, template_no_chunks_md, template_merge_md
@@ -130,7 +128,6 @@ def execute(self, state: dict) -> dict:
                 partial_variables={"context": chunk,
                                 "chunk_id": i + 1,
                                 "format_instructions": format_instructions})
-            # Add chain to dictionary with dynamic name
             chain_name = f"chunk{i+1}"
             chains_dict[chain_name] = prompt | self.llm_model | output_parser
 
diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py
@@ -113,7 +113,7 @@ def execute(self, state: dict) -> dict:
 
             chain =  prompt | self.llm_model | output_parser
             answer = chain.invoke({"question": user_prompt})
-            
+
             state.update({self.output[0]: answer})
             return state
 
@@ -148,4 +148,4 @@ def execute(self, state: dict) -> dict:
         answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
 
         state.update({self.output[0]: answer})
-        return state
+        return state
diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py
@@ -2,18 +2,13 @@
 Module for generating the answer node
 """
 
-# Imports from standard library
 from typing import List, Optional
-
-# Imports from Langchain
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from tqdm import tqdm
 from langchain_community.chat_models import ChatOllama
 from ..utils.logging import get_logger
-
-# Imports from the library
 from .base_node import BaseNode
 from ..helpers.generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf
 
diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py
@@ -83,7 +83,6 @@ def execute(self, state: dict) -> dict:
         user_prompt = input_data[0]
         doc = input_data[1]
 
-        # schema to be used for output parsing
         if self.node_config.get("schema", None) is not None:
             output_schema = JsonOutputParser(pydantic_object=self.node_config["schema"])
         else:
@@ -130,7 +129,6 @@ def execute(self, state: dict) -> dict:
         )
         map_chain = prompt | self.llm_model | StrOutputParser()
 
-        # Chain
         answer = map_chain.invoke({"question": user_prompt})
 
         state.update({self.output[0]: answer})
diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py
@@ -1,7 +1,6 @@
 """
 GetProbableTagsNode Module
 """
-
 from typing import List, Optional
 from langchain.output_parsers import CommaSeparatedListOutputParser
 from langchain.prompts import PromptTemplate
diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py
@@ -5,13 +5,11 @@
 import asyncio
 import copy
 from typing import List, Optional
-
 from tqdm.asyncio import tqdm
-
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 
-_default_batchsize = 16
+DEFAULT_BATCHSIZE = 16
 
 
 class GraphIteratorNode(BaseNode):
@@ -51,13 +49,15 @@ def execute(self, state: dict) -> dict:
                             the correct data from the state.
 
         Returns:
-            dict: The updated state with the output key containing the results of the graph instances.
+            dict: The updated state with the output key
+            containing the results of the graph instances.
 
         Raises:
-            KeyError: If the input keys are not found in the state, indicating that the
-                        necessary information for running the graph instances is missing.
+            KeyError: If the input keys are not found in the state,
+            indicating that the necessary information for running
+            the graph instances is missing.
         """
-        batchsize = self.node_config.get("batchsize", _default_batchsize)
+        batchsize = self.node_config.get("batchsize", DEFAULT_BATCHSIZE)
 
         self.logger.info(
             f"--- Executing {self.node_name} Node with batchsize {batchsize} ---"
diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py
@@ -3,14 +3,14 @@
 """
 
 from typing import List, Optional
-
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 
 
 class ImageToTextNode(BaseNode):
     """
-    Retrieve images from a list of URLs and return a description of the images using an image-to-text model.
+    Retrieve images from a list of URLs and return a description of 
+    the images using an image-to-text model.
 
     Attributes:
         llm_model: An instance of the language model client used for image-to-text conversion.
diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py
@@ -2,18 +2,10 @@
 MergeAnswersNode Module
 """
 
-# Imports from standard library
 from typing import List, Optional
-from tqdm import tqdm
-
-# Imports from Langchain
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
-from tqdm import tqdm
-
 from ..utils.logging import get_logger
-
-# Imports from the library
 from .base_node import BaseNode
 
 
diff --git a/scrapegraphai/nodes/merge_generated_scripts.py b/scrapegraphai/nodes/merge_generated_scripts.py
@@ -5,15 +5,9 @@
 # Imports from standard library
 from typing import List, Optional
 from tqdm import tqdm
-
-# Imports from Langchain
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
-from tqdm import tqdm
-
 from ..utils.logging import get_logger
-
-# Imports from the library
 from .base_node import BaseNode
 
 
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
@@ -75,23 +75,23 @@ def execute(self, state: dict) -> dict:
 
             chunks = chunk(text=docs_transformed.page_content,
                             chunk_size= self.node_config.get("chunk_size", 4096)-250,
-                            token_counter=lambda x: len(x),
+                            token_counter= lambda x: len(x),
                             memoize=False)
         else:
             docs_transformed = docs_transformed[0]
 
-            if type(docs_transformed) == Document:
+            if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                             chunk_size= self.node_config.get("chunk_size", 4096)-250,
-                            token_counter=lambda x: len(x),
+                            token_counter= lambda x: len(x),
                             memoize=False)
             else:
-                
+
                 chunks = chunk(text=docs_transformed,
                                 chunk_size= self.node_config.get("chunk_size", 4096)-250,
-                                token_counter=lambda x: len(x),
+                                token_counter= lambda x: len(x),
                                 memoize=False)
-                          
+    
         state.update({self.output[0]: chunks})
 
         return state
diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py
@@ -4,15 +4,9 @@
 
 from typing import List, Optional
 from urllib.parse import urlparse
-
 from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain.prompts import PromptTemplate
 from langchain.output_parsers import CommaSeparatedListOutputParser
-
-from langchain.output_parsers import CommaSeparatedListOutputParser
-from langchain.prompts import PromptTemplate
-from langchain_community.document_loaders import AsyncChromiumLoader
-
 from ..helpers import robots_dictionary
 from ..utils.logging import get_logger
 from .base_node import BaseNode
@@ -146,4 +140,4 @@ def execute(self, state: dict) -> dict:
                 self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m")
 
         state.update({self.output[0]: is_scrapable})
-        return state
+        return state
diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py
@@ -1,9 +1,7 @@
 """
 SearchInternetNode Module
 """
-
 from typing import List, Optional
-
 from langchain.output_parsers import CommaSeparatedListOutputParser
 from langchain.prompts import PromptTemplate
 from langchain_community.chat_models import ChatOllama
diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py
@@ -2,19 +2,13 @@
 SearchLinkNode Module
 """
 
-# Imports from standard library
 from typing import List, Optional
 import re
 from tqdm import tqdm
-
-# Imports from Langchain
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
-
 from ..utils.logging import get_logger
-
-# Imports from the library
 from .base_node import BaseNode
 
 
diff --git a/scrapegraphai/nodes/search_node_with_context.py b/scrapegraphai/nodes/search_node_with_context.py
@@ -67,7 +67,6 @@ def execute(self, state: dict) -> dict:
         # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
 
-        user_prompt = input_data[0]
         doc = input_data[1]
 
         output_parser = CommaSeparatedListOutputParser()
diff --git a/scrapegraphai/nodes/text_to_speech_node.py b/scrapegraphai/nodes/text_to_speech_node.py
@@ -1,13 +1,10 @@
 """
 TextToSpeechNode Module
 """
-
 from typing import List, Optional
-
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 
-
 class TextToSpeechNode(BaseNode):
     """
     Converts text to speech using the specified text-to-speech model.
diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py
@@ -1,8 +1,8 @@
 """
 convert_to_md modul
 """
-import html2text
 from urllib.parse import urlparse
+import html2text
 
 def convert_to_md(html: str, url: str = None) -> str:
     """ Convert HTML to Markdown.
diff --git a/scrapegraphai/utils/logging.py b/scrapegraphai/utils/logging.py
diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py
diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
diff --git a/scrapegraphai/utils/sys_dynamic_import.py b/scrapegraphai/utils/sys_dynamic_import.py
diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py