Skip to content

Commit 91ede93

Browse files
committed
refactoring of the code
1 parent eb25725 commit 91ede93

File tree

8 files changed

+49
-41
lines changed

8 files changed

+49
-41
lines changed

scrapegraphai/graphs/code_generator_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def _create_graph(self) -> BaseGraph:
6666
BaseGraph: A graph instance representing the web scraping workflow.
6767
"""
6868

69-
if self.schema is None:
69+
if self.schema is None:
7070
raise KeyError("The schema is required for CodeGeneratorGraph")
7171

7272
fetch_node = FetchNode(

scrapegraphai/integrations/burr_bridge.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
try:
1212
import burr
1313
from burr import tracking
14-
from burr.core import Application, ApplicationBuilder, State, Action, default, ApplicationContext
14+
from burr.core import (Application, ApplicationBuilder,
15+
State, Action, default, ApplicationContext)
1516
from burr.lifecycle import PostRunStepHook, PreRunStepHook
1617
except ImportError:
1718
raise ImportError("""burr package is not installed.

scrapegraphai/integrations/indexify_node.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,13 @@ def execute(self, state: dict) -> dict:
5050

5151
self.logger.info(f"--- Executing {self.node_name} Node ---")
5252

53-
# Interpret input keys based on the provided input expression
54-
# input_keys length matches the min_input_len parameter in the __init__ method
55-
# e.g. "answer & parsed_doc" or "answer | img_urls"
56-
5753
input_keys = self.get_input_keys(state)
5854

59-
# Fetching data from the state based on the input keys
6055
input_data = [state[key] for key in input_keys]
6156

6257
answer = input_data[0]
6358
img_urls = input_data[1]
6459

65-
# Indexify the content
66-
# ...
67-
6860
isIndexified = True
6961
state.update({self.output[0]: isIndexified})
7062

scrapegraphai/nodes/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
__init__.py file for node folder
2+
__init__.py file for node folder module
33
"""
44

55
from .base_node import BaseNode
@@ -19,7 +19,7 @@
1919
from .graph_iterator_node import GraphIteratorNode
2020
from .merge_answers_node import MergeAnswersNode
2121
from .generate_answer_omni_node import GenerateAnswerOmniNode
22-
from .merge_generated_scripts import MergeGeneratedScriptsNode
22+
from .merge_generated_scripts_node import MergeGeneratedScriptsNode
2323
from .fetch_screen_node import FetchScreenNode
2424
from .generate_answer_from_image_node import GenerateAnswerFromImageNode
2525
from .concat_answers_node import ConcatAnswersNode
@@ -32,4 +32,4 @@
3232
from .fetch_node_level_k import FetchNodeLevelK
3333
from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
3434
from .description_node import DescriptionNode
35-
from .parse_node_depth_k import ParseNodeDepthK
35+
from .parse_node_depth_k_node import ParseNodeDepthK

scrapegraphai/nodes/merge_generated_scripts.py renamed to scrapegraphai/nodes/merge_generated_scripts_node.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
MergeAnswersNode Module
33
"""
44
from typing import List, Optional
5-
from tqdm import tqdm
65
from langchain.prompts import PromptTemplate
7-
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
6+
from langchain_core.output_parsers import StrOutputParser
7+
from ..prompts import TEMPLATE_MERGE_SCRIPTS_PROMPT
88
from ..utils.logging import get_logger
99
from .base_node import BaseNode
1010

@@ -51,10 +51,8 @@ def execute(self, state: dict) -> dict:
5151

5252
self.logger.info(f"--- Executing {self.node_name} Node ---")
5353

54-
# Interpret input keys based on the provided input expression
5554
input_keys = self.get_input_keys(state)
5655

57-
# Fetching data from the state based on the input keys
5856
input_data = [state[key] for key in input_keys]
5957

6058
user_prompt = input_data[0]
@@ -67,20 +65,8 @@ def execute(self, state: dict) -> dict:
6765
scripts_str += "-----------------------------------\n"
6866
scripts_str += script
6967

70-
TEMPLATE_MERGE = """
71-
You are a python expert in web scraping and you have just generated multiple scripts to scrape different URLs.\n
72-
The scripts are generated based on a user question and the content of the websites.\n
73-
You need to create one single script that merges the scripts generated for each URL.\n
74-
The scraped contents are in a JSON format and you need to merge them based on the context and providing a correct JSON structure.\n
75-
The output should be just in python code without any comment and should implement the main function.\n
76-
The python script, when executed, should format the extracted information sticking to the user question and scripts output format.\n
77-
USER PROMPT: {user_prompt}\n
78-
SCRIPTS:\n
79-
{scripts}
80-
"""
81-
8268
prompt_template = PromptTemplate(
83-
template=TEMPLATE_MERGE,
69+
template=TEMPLATE_MERGE_SCRIPTS_PROMPT,
8470
input_variables=["user_prompt"],
8571
partial_variables={
8672
"scripts": scripts_str,
@@ -90,6 +76,5 @@ def execute(self, state: dict) -> dict:
9076
merge_chain = prompt_template | self.llm_model | StrOutputParser()
9177
answer = merge_chain.invoke({"user_prompt": user_prompt})
9278

93-
# Update the state with the generated answer
9479
state.update({self.output[0]: answer})
9580
return state

scrapegraphai/prompts/__init__.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,37 @@
22
__init__.py for the prompts folder
33
"""
44

5-
from .generate_answer_node_prompts import TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD
6-
from .generate_answer_node_csv_prompts import TEMPLATE_CHUKS_CSV, TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV
7-
from .generate_answer_node_pdf_prompts import TEMPLATE_CHUNKS_PDF, TEMPLATE_NO_CHUNKS_PDF, TEMPLATE_MERGE_PDF
8-
from .generate_answer_node_omni_prompts import TEMPLATE_CHUNKS_OMNI, TEMPLATE_NO_CHUNKS_OMNI, TEMPLATE_MERGE_OMNI
5+
from .generate_answer_node_prompts import (TEMPLATE_CHUNKS,
6+
TEMPLATE_NO_CHUNKS,
7+
TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD,
8+
TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD)
9+
from .generate_answer_node_csv_prompts import (TEMPLATE_CHUKS_CSV,
10+
TEMPLATE_NO_CHUKS_CSV,
11+
TEMPLATE_MERGE_CSV)
12+
from .generate_answer_node_pdf_prompts import (TEMPLATE_CHUNKS_PDF,
13+
TEMPLATE_NO_CHUNKS_PDF,
14+
TEMPLATE_MERGE_PDF)
15+
from .generate_answer_node_omni_prompts import (TEMPLATE_CHUNKS_OMNI,
16+
TEMPLATE_NO_CHUNKS_OMNI,
17+
TEMPLATE_MERGE_OMNI)
918
from .merge_answer_node_prompts import TEMPLATE_COMBINED
1019
from .robots_node_prompts import TEMPLATE_ROBOT
1120
from .search_internet_node_prompts import TEMPLATE_SEARCH_INTERNET
1221
from .search_link_node_prompts import TEMPLATE_RELEVANT_LINKS
13-
from .search_node_with_context_prompts import TEMPLATE_SEARCH_WITH_CONTEXT_CHUNKS, TEMPLATE_SEARCH_WITH_CONTEXT_NO_CHUNKS
22+
from .search_node_with_context_prompts import (TEMPLATE_SEARCH_WITH_CONTEXT_CHUNKS,
23+
TEMPLATE_SEARCH_WITH_CONTEXT_NO_CHUNKS)
1424
from .prompt_refiner_node_prompts import TEMPLATE_REFINER, TEMPLATE_REFINER_WITH_CONTEXT
1525
from .html_analyzer_node_prompts import TEMPLATE_HTML_ANALYSIS, TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT
1626
from .generate_code_node_prompts import (TEMPLATE_INIT_CODE_GENERATION,
17-
TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_SYNTAX_CODE_GENERATION,
18-
TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION,
19-
TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION,
20-
TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS,
27+
TEMPLATE_SYNTAX_ANALYSIS,
28+
TEMPLATE_SYNTAX_CODE_GENERATION,
29+
TEMPLATE_EXECUTION_ANALYSIS,
30+
TEMPLATE_EXECUTION_CODE_GENERATION,
31+
TEMPLATE_VALIDATION_ANALYSIS,
32+
TEMPLATE_VALIDATION_CODE_GENERATION,
33+
TEMPLATE_SEMANTIC_COMPARISON,
34+
TEMPLATE_SEMANTIC_ANALYSIS,
2135
TEMPLATE_SEMANTIC_CODE_GENERATION)
22-
from .reasoning_node_prompts import TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT
36+
from .reasoning_node_prompts import (TEMPLATE_REASONING,
37+
TEMPLATE_REASONING_WITH_CONTEXT)
38+
from .merge_generated_scripts_prompts import TEMPLATE_MERGE_SCRIPTS_PROMPT
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""
2+
merge_generated_scripts_prompts module
3+
"""
4+
TEMPLATE_MERGE_SCRIPTS_PROMPT = """
5+
You are a python expert in web scraping and you have just generated multiple scripts to scrape different URLs.\n
6+
The scripts are generated based on a user question and the content of the websites.\n
7+
You need to create one single script that merges the scripts generated for each URL.\n
8+
The scraped contents are in a JSON format and you need to merge them based on the context and providing a correct JSON structure.\n
9+
The output should be just in python code without any comment and should implement the main function.\n
10+
The python script, when executed, should format the extracted information sticking to the user question and scripts output format.\n
11+
USER PROMPT: {user_prompt}\n
12+
SCRIPTS:\n
13+
{scripts}
14+
"""

0 commit comments

Comments (0)