Skip to content

Commit c251cc4

Browse files
committed
fix(node-logging): use centralized logger in each node for logging
1 parent 4348d4f commit c251cc4

18 files changed

+406
-242
lines changed

scrapegraphai/nodes/blocks_identifier.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,33 +3,44 @@
33
"""
44

55
from typing import List, Optional
6+
67
from langchain_community.document_loaders import AsyncChromiumLoader
78
from langchain_core.documents import Document
8-
from .base_node import BaseNode
99

10+
from .base_node import BaseNode
1011

1112

1213
class BlocksIndentifier(BaseNode):
1314
"""
1415
A node responsible for identifying the blocks in the HTML content of a specified web page,
15-
e.g products in a E-commerce, flights in a travel website etc.
16+
e.g. products on an e-commerce site, flights on a travel website, etc.
1617
1718
Attributes:
1819
headless (bool): A flag indicating whether the browser should run in headless mode.
1920
verbose (bool): A flag indicating whether to print verbose output during execution.
20-
21+
2122
Args:
2223
input (str): Boolean expression defining the input keys needed from the state.
2324
output (List[str]): List of output keys to be updated in the state.
2425
node_config (Optional[dict]): Additional configuration for the node.
2526
node_name (str): The unique identifier name for the node, defaulting to "BlocksIndentifier".
2627
"""
2728

28-
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "BlocksIndentifier"):
29+
def __init__(
30+
self,
31+
input: str,
32+
output: List[str],
33+
node_config: Optional[dict],
34+
node_name: str = "BlocksIndentifier",
35+
):
2936
super().__init__(node_name, "node", input, output, 1)
3037

31-
self.headless = True if node_config is None else node_config.get("headless", True)
32-
self.verbose = True if node_config is None else node_config.get("verbose", False)
38+
self.headless = (
39+
True if node_config is None else node_config.get("headless", True)
40+
)
41+
self.verbose = (
42+
True if node_config is None else node_config.get("verbose", False)
43+
)
3344

3445
def execute(self, state):
3546
"""
@@ -47,8 +58,7 @@ def execute(self, state):
4758
KeyError: If the input key is not found in the state, indicating that the
4859
necessary information to perform the operation is missing.
4960
"""
50-
if self.verbose:
51-
print(f"--- Executing {self.node_name} Node ---")
61+
self.logger.info(f"--- Executing {self.node_name} Node ---")
5262

5363
# Interpret input keys based on the provided input expression
5464
input_keys = self.get_input_keys(state)

scrapegraphai/nodes/fetch_node.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,18 @@
33
"""
44

55
import json
6-
import requests
76
from typing import List, Optional
87

98
import pandas as pd
9+
import requests
1010
from langchain_community.document_loaders import PyPDFLoader
1111
from langchain_core.documents import Document
1212

1313
from ..docloaders import ChromiumLoader
14-
from .base_node import BaseNode
1514
from ..utils.cleanup_html import cleanup_html
1615
from ..utils.logging import get_logger
16+
from .base_node import BaseNode
17+
1718

1819
class FetchNode(BaseNode):
1920
"""
@@ -51,7 +52,7 @@ def __init__(
5152
False if node_config is None else node_config.get("verbose", False)
5253
)
5354
self.useSoup = (
54-
False if node_config is None else node_config.get("useSoup", False)
55+
False if node_config is None else node_config.get("useSoup", False)
5556
)
5657
self.loader_kwargs = (
5758
{} if node_config is None else node_config.get("loader_kwargs", {})
@@ -73,8 +74,8 @@ def execute(self, state):
7374
KeyError: If the input key is not found in the state, indicating that the
7475
necessary information to perform the operation is missing.
7576
"""
76-
77-
logger.info(f"--- Executing {self.node_name} Node ---")
77+
78+
self.logger.info(f"--- Executing {self.node_name} Node ---")
7879

7980
# Interpret input keys based on the provided input expression
8081
input_keys = self.get_input_keys(state)
@@ -92,7 +93,7 @@ def execute(self, state):
9293
]
9394
state.update({self.output[0]: compressed_document})
9495
return state
95-
96+
9697
# handling for pdf
9798
elif input_keys[0] == "pdf":
9899
loader = PyPDFLoader(source)
@@ -108,15 +109,15 @@ def execute(self, state):
108109
]
109110
state.update({self.output[0]: compressed_document})
110111
return state
111-
112+
112113
elif input_keys[0] == "json":
113114
f = open(source)
114115
compressed_document = [
115116
Document(page_content=str(json.load(f)), metadata={"source": "json"})
116117
]
117118
state.update({self.output[0]: compressed_document})
118119
return state
119-
120+
120121
elif input_keys[0] == "xml":
121122
with open(source, "r", encoding="utf-8") as f:
122123
data = f.read()
@@ -125,25 +126,29 @@ def execute(self, state):
125126
]
126127
state.update({self.output[0]: compressed_document})
127128
return state
128-
129+
129130
elif self.input == "pdf_dir":
130131
pass
131132

132133
elif not source.startswith("http"):
133134
title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
134135
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
135-
compressed_document = [Document(page_content=parsed_content,
136-
metadata={"source": "local_dir"}
137-
)]
138-
136+
compressed_document = [
137+
Document(page_content=parsed_content, metadata={"source": "local_dir"})
138+
]
139+
139140
elif self.useSoup:
140141
response = requests.get(source)
141142
if response.status_code == 200:
142-
title, minimized_body, link_urls, image_urls = cleanup_html(response.text, source)
143+
title, minimized_body, link_urls, image_urls = cleanup_html(
144+
response.text, source
145+
)
143146
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
144147
compressed_document = [Document(page_content=parsed_content)]
145-
else:
146-
self.logger.warning(f"Failed to retrieve contents from the webpage at url: {source}")
148+
else:
149+
self.logger.warning(
150+
f"Failed to retrieve contents from the webpage at url: {source}"
151+
)
147152

148153
else:
149154
loader_kwargs = {}
@@ -153,14 +158,22 @@ def execute(self, state):
153158

154159
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
155160
document = loader.load()
156-
157-
title, minimized_body, link_urls, image_urls = cleanup_html(str(document[0].page_content), source)
161+
162+
title, minimized_body, link_urls, image_urls = cleanup_html(
163+
str(document[0].page_content), source
164+
)
158165
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
159-
166+
160167
compressed_document = [
161168
Document(page_content=parsed_content, metadata={"source": source})
162169
]
163170

164-
state.update({self.output[0]: compressed_document, self.output[1]: link_urls, self.output[2]: image_urls})
171+
state.update(
172+
{
173+
self.output[0]: compressed_document,
174+
self.output[1]: link_urls,
175+
self.output[2]: image_urls,
176+
}
177+
)
165178

166179
return state

scrapegraphai/nodes/generate_answer_csv_node.py

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,16 @@
22
gg
33
Module for generating the answer node
44
"""
5+
56
# Imports from standard library
67
from typing import List, Optional
7-
from tqdm import tqdm
88

99
# Imports from Langchain
1010
from langchain.prompts import PromptTemplate
1111
from langchain_core.output_parsers import JsonOutputParser
1212
from langchain_core.runnables import RunnableParallel
13+
from tqdm import tqdm
14+
1315
from ..utils.logging import get_logger
1416

1517
# Imports from the library
@@ -25,24 +27,29 @@ class GenerateAnswerCSVNode(BaseNode):
2527
2628
Attributes:
2729
llm_model: An instance of a language model client, configured for generating answers.
28-
node_name (str): The unique identifier name for the node, defaulting
30+
node_name (str): The unique identifier name for the node, defaulting
2931
to "GenerateAnswerNodeCsv".
30-
node_type (str): The type of the node, set to "node" indicating a
32+
node_type (str): The type of the node, set to "node" indicating a
3133
standard operational node.
3234
3335
Args:
34-
llm_model: An instance of the language model client (e.g., ChatOpenAI) used
36+
llm_model: An instance of the language model client (e.g., ChatOpenAI) used
3537
for generating answers.
36-
node_name (str, optional): The unique identifier name for the node.
38+
node_name (str, optional): The unique identifier name for the node.
3739
Defaults to "GenerateAnswerNodeCsv".
3840
3941
Methods:
4042
execute(state): Processes the input and document from the state to generate an answer,
4143
updating the state with the generated answer under the 'answer' key.
4244
"""
4345

44-
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None,
45-
node_name: str = "GenerateAnswer"):
46+
def __init__(
47+
self,
48+
input: str,
49+
output: List[str],
50+
node_config: Optional[dict] = None,
51+
node_name: str = "GenerateAnswer",
52+
):
4653
"""
4754
Initializes the GenerateAnswerNodeCsv with a language model client and a node name.
4855
Args:
@@ -51,8 +58,9 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict] =
5158
"""
5259
super().__init__(node_name, "node", input, output, 2, node_config)
5360
self.llm_model = node_config["llm_model"]
54-
self.verbose = False if node_config is None else node_config.get(
55-
"verbose", False)
61+
self.verbose = (
62+
False if node_config is None else node_config.get("verbose", False)
63+
)
5664

5765
def execute(self, state):
5866
"""
@@ -73,8 +81,7 @@ def execute(self, state):
7381
that the necessary information for generating an answer is missing.
7482
"""
7583

76-
if self.verbose:
77-
self.logger.info(f"--- Executing {self.node_name} Node ---")
84+
self.logger.info(f"--- Executing {self.node_name} Node ---")
7885

7986
# Interpret input keys based on the provided input expression
8087
input_keys = self.get_input_keys(state)
@@ -122,21 +129,27 @@ def execute(self, state):
122129
chains_dict = {}
123130

124131
# Use tqdm to add progress bar
125-
for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)):
132+
for i, chunk in enumerate(
133+
tqdm(doc, desc="Processing chunks", disable=not self.verbose)
134+
):
126135
if len(doc) == 1:
127136
prompt = PromptTemplate(
128137
template=template_no_chunks,
129138
input_variables=["question"],
130-
partial_variables={"context": chunk.page_content,
131-
"format_instructions": format_instructions},
139+
partial_variables={
140+
"context": chunk.page_content,
141+
"format_instructions": format_instructions,
142+
},
132143
)
133144
else:
134145
prompt = PromptTemplate(
135146
template=template_chunks,
136147
input_variables=["question"],
137-
partial_variables={"context": chunk.page_content,
138-
"chunk_id": i + 1,
139-
"format_instructions": format_instructions},
148+
partial_variables={
149+
"context": chunk.page_content,
150+
"chunk_id": i + 1,
151+
"format_instructions": format_instructions,
152+
},
140153
)
141154

142155
# Dynamically name the chains based on their index
@@ -155,8 +168,7 @@ def execute(self, state):
155168
partial_variables={"format_instructions": format_instructions},
156169
)
157170
merge_chain = merge_prompt | self.llm_model | output_parser
158-
answer = merge_chain.invoke(
159-
{"context": answer, "question": user_prompt})
171+
answer = merge_chain.invoke({"context": answer, "question": user_prompt})
160172
else:
161173
# Chain
162174
single_chain = list(chains_dict.values())[0]

0 commit comments

Comments
 (0)