
Commit 1981230

add multi scraper integration
1 parent 4d0d8fa commit 1981230

File tree: 6 files changed, +251 -1 lines changed
Lines changed: 54 additions & 0 deletions

"""
Basic example of a scraping pipeline using ScriptCreatorMultiGraph
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import ScriptCreatorMultiGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
    "library": "beautifulsoup"
}

# ************************************************
# Define the sources for the graph
# ************************************************

urls = [
    "https://schultzbergagency.com/emil-raste-karlsen/",
    "https://schultzbergagency.com/johanna-hedberg/",
]

# ************************************************
# Create the ScriptCreatorMultiGraph instance and run it
# ************************************************

script_creator_graph = ScriptCreatorMultiGraph(
    prompt="Find information about actors",
    # also accepts a string with the already downloaded HTML code
    source=urls,
    config=graph_config
)

result = script_creator_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = script_creator_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

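The example reads the API key from a local .env file via load_dotenv(). A minimal sketch of that file, assuming the key name used above; the value shown is only a placeholder:

    OPENAI_APIKEY=sk-...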
scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -20,3 +20,4 @@
 from .json_scraper_multi import JSONScraperMultiGraph
 from .csv_scraper_graph_multi import CSVScraperMultiGraph
 from .xml_scraper_graph_multi import XMLScraperMultiGraph
+from .script_creator_multi_graph import ScriptCreatorMultiGraph
scrapegraphai/graphs/script_creator_multi_graph.py

Lines changed: 114 additions & 0 deletions

"""
ScriptCreatorMultiGraph Module
"""

from copy import copy, deepcopy
from typing import List, Optional

from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .script_creator_graph import ScriptCreatorGraph

from ..nodes import (
    GraphIteratorNode,
    MergeGeneratedScriptsNode
)


class ScriptCreatorMultiGraph(AbstractGraph):
    """
    ScriptCreatorMultiGraph is a scraping pipeline that generates a web scraping script
    for each URL in a list. It only requires a user prompt and a list of URLs.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[str]): The schema for the graph output.

    Example:
        >>> script_graph = ScriptCreatorMultiGraph(
        ...     "What is Chioggia famous for?",
        ...     source=[],
        ...     config={"llm": {"model": "gpt-3.5-turbo"}},
        ...     schema={}
        ... )
        >>> result = script_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):

        self.max_results = config.get("max_results", 3)

        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching.

        Returns:
            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """

        # ************************************************
        # Create a ScriptCreatorGraph instance
        # ************************************************

        script_generator_instance = ScriptCreatorGraph(
            prompt="",
            source="",
            config=self.copy_config,
        )

        # ************************************************
        # Define the graph nodes
        # ************************************************

        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & urls",
            output=["results"],
            node_config={
                "graph_instance": script_generator_instance,
            }
        )

        merge_scripts_node = MergeGeneratedScriptsNode(
            input="user_prompt & results",
            output=["scripts"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_scripts_node,
            ],
            edges=[
                (graph_iterator_node, merge_scripts_node),
            ],
            entry_point=graph_iterator_node
        )

    def run(self) -> str:
        """
        Executes the web scraping and searching process.

        Returns:
            The generated scripts.
        """
        inputs = {"user_prompt": self.prompt, "urls": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
        return self.final_state.get("scripts", [])

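Since run() returns whatever is stored under the "scripts" key (the list produced by MergeGeneratedScriptsNode), a caller can iterate over it directly. A minimal sketch of consuming that list; the output file names are placeholders and not part of the commit:

    scripts = script_creator_graph.run()
    for i, script in enumerate(scripts):
        # write each generated scraper to its own file (hypothetical paths)
        with open(f"scraper_{i}.py", "w") as f:
            f.write(script)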
scrapegraphai/nodes/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -20,3 +20,4 @@
 from .graph_iterator_node import GraphIteratorNode
 from .merge_answers_node import MergeAnswersNode
 from .generate_answer_omni_node import GenerateAnswerOmniNode
+from .merge_generated_scripts import MergeGeneratedScriptsNode

scrapegraphai/nodes/generate_scraper_node.py

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@ def execute(self, state: dict) -> dict:
         SOURCE: {source}
         QUESTION: {question}
         """
-        print("source:", self.source)
+
         if len(doc) > 1:
             raise NotImplementedError(
                 "Currently GenerateScraperNode cannot handle more than 1 context chunks"
scrapegraphai/nodes/merge_generated_scripts.py

Lines changed: 80 additions & 0 deletions

"""
MergeGeneratedScriptsNode Module
"""

# Imports from standard library
from typing import List, Optional
from tqdm import tqdm

# Imports from Langchain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

from ..utils.logging import get_logger

# Imports from the library
from .base_node import BaseNode


class MergeGeneratedScriptsNode(BaseNode):
    """
    A node responsible for merging the scripts generated by multiple graph instances.

    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "MergeAnswers".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "MergeAnswers",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        self.llm_model = node_config["llm_model"]
        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )

    def execute(self, state: dict) -> dict:
        """
        Executes the node's logic to merge the scripts generated by multiple graph
        instances into a single output.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the correct data from the state.

        Returns:
            dict: The updated state with the output key containing the generated scripts.

        Raises:
            KeyError: If the input keys are not found in the state, indicating
                that the necessary information for generating an answer is missing.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)

        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]

        scripts = input_data[1]

        # Print each generated script
        for i, script_str in enumerate(scripts):
            print(f"Script #{i}")
            print("=" * 40)
            print(script_str)
            print("-" * 40)

        # Update the state with the generated scripts
        state.update({self.output[0]: scripts})
        return state

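For reference, the node can be exercised on its own by passing a state dict that matches its input expression "user_prompt & results". A minimal sketch under the assumption that BaseNode supplies get_input_keys and logger; the llm_model value and state contents are placeholders for illustration:

    node = MergeGeneratedScriptsNode(
        input="user_prompt & results",
        output=["scripts"],
        node_config={"llm_model": None, "verbose": True},  # placeholder model
    )
    state = {
        "user_prompt": "Find information about actors",
        "results": ["# script for url 1", "# script for url 2"],
    }
    state = node.execute(state)  # prints each script and stores the list under "scripts"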