Skip to content

Commit ffa1067

Browse files
authored
Merge pull request #756 from shenghongtw/pre/beta
The smart_scraper_multi_graph method is too expensive
2 parents b912904 + da2a3c8 commit ffa1067

8 files changed

+371
-6
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper
3+
"""
4+
import os
5+
import json
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import SmartScraperMultiLiteGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
load_dotenv()
11+
12+
# ************************************************
13+
# Define the configuration for the graph
14+
# ************************************************
15+
16+
17+
graph_config = {
18+
"llm": {
19+
"api_key": os.getenv("OPENAI_API_KEY"),
20+
"model": "openai/gpt-4o",
21+
},
22+
"verbose": True,
23+
"headless": False,
24+
}
25+
26+
# ************************************************
27+
# Create the SmartScraperGraph instance and run it
28+
# ************************************************
29+
30+
smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph(
31+
prompt="Who is Marco Perini?",
32+
source= [
33+
"https://perinim.github.io/",
34+
"https://perinim.github.io/cv/"
35+
],
36+
config=graph_config
37+
)
38+
39+
result = smart_scraper_multi_lite_graph.run()
40+
print(json.dumps(result, indent=4))
41+
42+
# ************************************************
43+
# Get graph execution info
44+
# ************************************************
45+
46+
graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info()
47+
print(prettify_exec_info(graph_exec_info))

scrapegraphai/graphs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,5 @@
2525
from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
2626
from .code_generator_graph import CodeGeneratorGraph
2727
from .depth_search_graph import DepthSearchGraph
28+
from .smart_scraper_multi_lite_graph import SmartScraperMultiLiteGraph
29+
from .scrape_graph import ScrapeGraph

scrapegraphai/graphs/scrape_graph.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""
2+
ScrapeGraph Module
3+
"""
4+
from typing import Optional
5+
from pydantic import BaseModel
6+
from .base_graph import BaseGraph
7+
from .abstract_graph import AbstractGraph
8+
from ..nodes import (
9+
FetchNode,
10+
ParseNode,
11+
)
12+
13+
class ScrapeGraph(AbstractGraph):
    """
    ScrapeGraph is a scraping pipeline that automates the process of
    extracting information from web pages. It only fetches and parses the
    page content (its graph contains a fetch node and a parse node — no
    LLM answer generation is performed here).

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.

    Args:
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        prompt (str): The prompt for the graph (optional).
        schema (BaseModel): The schema for the graph output (optional).

    Example:
        >>> scrape_graph = ScrapeGraph(
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
        ... )
        >>> result = scrape_graph.run()
    """

    def __init__(self, source: str, config: dict, prompt: str = "", schema: Optional[BaseModel] = None):
        super().__init__(prompt, config, source, schema)

        # Sources starting with "http" are fetched over the network;
        # anything else is treated as a local directory/file source.
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="url| local_dir",
            output=["doc"],
            node_config={
                "llm_model": self.llm_model,
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
                "browser_base": self.config.get("browser_base"),
                "scrape_do": self.config.get("scrape_do")
            }
        )

        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "llm_model": self.llm_model,
                "chunk_size": self.model_token
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
            ],
            edges=[
                (fetch_node, parse_node),
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the parsed content.

        Returns:
            str: The scraping content.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("parsed_doc", "No document found.")

scrapegraphai/graphs/smart_scraper_multi_concat_graph.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,11 @@ class SmartScraperMultiConcatGraph(AbstractGraph):
3535
schema (Optional[BaseModel]): The schema for the graph output.
3636
3737
Example:
38-
>>> search_graph = MultipleSearchGraph(
38+
>>> smart_scraper_multi_concat_graph = SmartScraperMultiConcatGraph(
3939
... "What is Chioggia famous for?",
4040
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
4141
... )
42-
>>> result = search_graph.run()
42+
>>> result = smart_scraper_multi_concat_graph.run()
4343
"""
4444

4545
def __init__(self, prompt: str, source: List[str],

scrapegraphai/graphs/smart_scraper_multi_graph.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ class SmartScraperMultiGraph(AbstractGraph):
1818
SmartScraperMultiGraph is a scraping pipeline that scrapes a
1919
list of URLs and generates answers to a given prompt.
2020
It only requires a user prompt and a list of URLs.
21+
The difference with the SmartScraperMultiLiteGraph is that in this case the content will be abstracted
22+
by the llm, then merged, and finally passed to the llm.
2123
2224
Attributes:
2325
prompt (str): The user prompt to search the internet.
@@ -34,11 +36,15 @@ class SmartScraperMultiGraph(AbstractGraph):
3436
schema (Optional[BaseModel]): The schema for the graph output.
3537
3638
Example:
37-
>>> search_graph = MultipleSearchGraph(
38-
... "What is Chioggia famous for?",
39-
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
39+
>>> smart_scraper_multi_graph = SmartScraperMultiGraph(
40+
... prompt="Who is Marco Perini?",
41+
... source= [
42+
... "https://perinim.github.io/",
43+
... "https://perinim.github.io/cv/"
44+
... ],
45+
... config={"llm": {"model": "openai/gpt-3.5-turbo"}}
4046
... )
41-
>>> result = search_graph.run()
47+
>>> result = smart_scraper_multi_graph.run()
4248
"""
4349

4450
def __init__(self, prompt: str, source: List[str],
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""
2+
SmartScraperMultiGraph Module
3+
"""
4+
from copy import deepcopy
5+
from typing import List, Optional
6+
from pydantic import BaseModel
7+
from .base_graph import BaseGraph
8+
from .abstract_graph import AbstractGraph
9+
from .scrape_graph import ScrapeGraph
10+
from ..nodes import (
11+
GraphIteratorNode,
12+
MergeAnswersNode,
13+
)
14+
from ..utils.copy import safe_deepcopy
15+
16+
class SmartScraperMultiLiteGraph(AbstractGraph):
    """
    SmartScraperMultiLiteGraph is a scraping pipeline that scrapes a
    list of URLs, merges their content first, and finally generates an
    answer to a given prompt.
    It only requires a user prompt and a list of URLs.
    The difference with the SmartScraperMultiGraph is that in this case
    the content is merged before being passed to the llm.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph(
        ...     prompt="Who is Marco Perini?",
        ...     source=[
        ...         "https://perinim.github.io/",
        ...         "https://perinim.github.io/cv/"
        ...     ],
        ...     config={"llm": {"model": "openai/gpt-3.5-turbo"}}
        ... )
        >>> result = smart_scraper_multi_lite_graph.run()
    """

    def __init__(self, prompt: str, source: List[str],
                 config: dict, schema: Optional[BaseModel] = None):
        # Copies are taken before super().__init__ so the per-URL sub-graphs
        # spawned by GraphIteratorNode do not share mutable config/schema
        # state with this graph.
        self.copy_config = safe_deepcopy(config)
        self.copy_schema = deepcopy(schema)
        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping
        and parsing, then merging the content and generating answers to a given prompt.

        Returns:
            BaseGraph: A graph instance representing the scrape-then-merge workflow.
        """
        # Runs one ScrapeGraph (fetch + parse only, no LLM call) per URL.
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & urls",
            output=["parsed_doc"],
            node_config={
                "graph_instance": ScrapeGraph,
                "scraper_config": self.copy_config,
            },
            schema=self.copy_schema
        )

        # Merges all parsed documents and generates a single answer.
        merge_answers_node = MergeAnswersNode(
            input="user_prompt & parsed_doc",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.copy_schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the web scraping and parsing process first and
        then concatenates the content and generates answers to a given prompt.

        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt, "urls": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
        return self.final_state.get("answer", "No answer found.")

tests/graphs/scrape_graph_test.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
Module for testing the scrape graph class
3+
"""
4+
5+
import os
6+
import pytest
7+
import pandas as pd
8+
from dotenv import load_dotenv
9+
from scrapegraphai.graphs import ScrapeGraph
10+
from scrapegraphai.utils import prettify_exec_info
11+
12+
load_dotenv()
13+
14+
@pytest.fixture
15+
def graph_config():
16+
"""Configuration of the graph"""
17+
openai_key = os.getenv("OPENAI_APIKEY")
18+
return {
19+
"llm": {
20+
"api_key": openai_key,
21+
"model": "openai/gpt-3.5-turbo",
22+
},
23+
"verbose": True,
24+
"headless": False,
25+
}
26+
27+
def test_scraping_pipeline(graph_config):
28+
"""Start of the scraping pipeline"""
29+
scrape_graph = ScrapeGraph(
30+
source="https://perinim.github.io/projects/",
31+
config=graph_config,
32+
)
33+
34+
result = scrape_graph.run()
35+
36+
assert result is not None
37+
assert isinstance(result, list)
38+
39+
def test_get_execution_info(graph_config):
40+
"""Get the execution info"""
41+
scrape_graph = ScrapeGraph(
42+
source="https://perinim.github.io/projects/",
43+
config=graph_config,
44+
)
45+
46+
scrape_graph.run()
47+
48+
graph_exec_info = scrape_graph.get_execution_info()
49+
50+
assert graph_exec_info is not None

0 commit comments

Comments
 (0)