Commit 17c5145

Merge pull request #717 from vedovati-matteo/deep_scraper_integration
Fetch_node_level_k and parse_node_depth_k added
2 parents 6124fbd + 015c6fd commit 17c5145

7 files changed: +456 additions, −11 deletions
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+
+from scrapegraphai.graphs import DepthSearchGraph
+
+graph_config = {
+    "llm": {
+        "api_key": "YOUR_API_KEY",
+        "model": "openai/gpt-4o-mini",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": True,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
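
DepthSearchGraph.run() returns the documents gathered by the fetch and parse nodes rather than an LLM-generated answer, so `result` above is a collection of per-page entries. A minimal sketch of inspecting it, assuming each entry is the {"source", "document"} dict that FetchNodeLevelK stores under the "docs" state key (not part of the commit):

# Sketch only: iterate over the documents returned by DepthSearchGraph.run().
# Assumes each entry keeps the "source" URL set by FetchNodeLevelK; the
# "document" payload may be reshaped by ParseNodeDepthK downstream.
for doc in result:
    print(doc["source"])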

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -26,3 +26,4 @@
 from .screenshot_scraper_graph import ScreenshotScraperGraph
 from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
 from .code_generator_graph import CodeGeneratorGraph
+from .depth_search_graph import DepthSearchGraph
scrapegraphai/graphs/depth_search_graph.py

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+"""
+... Module
+"""
+from typing import Optional
+import logging
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from ..utils.save_code_to_file import save_code_to_file
+from ..nodes import (
+    FetchNodeLevelK,
+    ParseNodeDepthK
+)
+
+class DepthSearchGraph(AbstractGraph):
+    """
+    DepthSearchGraph is a scraping pipeline that fetches the HTML of a source page and of its
+    sub-links, recursively, up to a configurable depth, and then parses the collected documents.
+    It requires a user prompt, a source URL, and optionally an output schema.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+
+    Example:
+        >>> search_graph = DepthSearchGraph(
+        ...     "List me all the projects with their description",
+        ...     "https://perinim.github.io/projects/",
+        ...     {"llm": {"model": "openai/gpt-4o-mini"}, "depth": 2}
+        ... )
+        >>> result = search_graph.run()
+    """
+
+    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
+        super().__init__(prompt, config, source, schema)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
+        """
+        fetch_node = FetchNodeLevelK(
+            input="url| local_dir",
+            output=["docs"],
+            node_config={
+                "loader_kwargs": self.config.get("loader_kwargs", {}),
+                "force": self.config.get("force", False),
+                "cut": self.config.get("cut", True),
+                "browser_base": self.config.get("browser_base"),
+                "depth": self.config.get("depth", 1),
+                "only_inside_links": self.config.get("only_inside_links", False)
+            }
+        )
+
+        parse_node = ParseNodeDepthK(
+            input="docs",
+            output=["docs"],
+            node_config={
+                "verbose": self.config.get("verbose", False)
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node
+            ],
+            edges=[
+                (fetch_node, parse_node),
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
+        Executes the scraping process and returns the documents fetched up to the configured depth.
+
+        Returns:
+            str: The content stored under the "docs" key of the final state.
+        """
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        docs = self.final_state.get("docs", "No docs")
+
+        return docs
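
Beyond the returned docs, run() also keeps the full final state and the execution trace on the instance, since graph.execute returns both. A minimal sketch of inspecting them after the example run above (attribute names as assigned in run(); not part of the commit):

# Sketch only: inspect what run() stores on the DepthSearchGraph instance.
docs = search_graph.run()
print(search_graph.final_state.keys())   # every key produced by the fetch/parse pipeline
print(search_graph.execution_info)       # execution metadata returned by BaseGraph.execute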

scrapegraphai/nodes/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -28,6 +28,7 @@
 from .generate_code_node import GenerateCodeNode
 from .search_node_with_context import SearchLinksWithContext
 from .reasoning_node import ReasoningNode
-from .fetch_node_level_k import FetchNodelevelK
+from .fetch_node_level_k import FetchNodeLevelK
 from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
 from .description_node import DescriptionNode
+from .parse_node_depth_k import ParseNodeDepthK
scrapegraphai/nodes/fetch_node_level_k.py

Lines changed: 158 additions & 10 deletions
@@ -1,15 +1,21 @@
 """
-FetchNodelevelK Module
+FetchNodeLevelK Module
 """
 from typing import List, Optional
 from .base_node import BaseNode
+from ..docloaders import ChromiumLoader
+from ..utils.cleanup_html import cleanup_html
+from ..utils.convert_to_md import convert_to_md
+from langchain_core.documents import Document
+from bs4 import BeautifulSoup
+from urllib.parse import quote, urljoin
 
-class FetchNodelevelK(BaseNode):
+class FetchNodeLevelK(BaseNode):
     """
-    A node responsible for compressing the input tokens and storing the document
-    in a vector database for retrieval. Relevant chunks are stored in the state.
-
-    It allows scraping of big documents without exceeding the token limit of the language model.
+    A node responsible for fetching the HTML content of a specified URL and of all its sub-links,
+    recursively, up to a configurable depth of hyperlinks. The fetched content is then used to
+    update the graph's state. It uses ChromiumLoader to fetch the content from a web page
+    asynchronously (with proxy protection).
 
     Attributes:
         llm_model: An instance of a language model client, configured for generating answers.
@@ -27,16 +33,158 @@ def __init__(
         input: str,
         output: List[str],
         node_config: Optional[dict] = None,
-        node_name: str = "RAG",
+        node_name: str = "FetchLevelK",
     ):
         super().__init__(node_name, "node", input, output, 2, node_config)
-
-        self.llm_model = node_config["llm_model"]
+
         self.embedder_model = node_config.get("embedder_model", None)
+
         self.verbose = (
             False if node_config is None else node_config.get("verbose", False)
         )
+
         self.cache_path = node_config.get("cache_path", False)
+
+        self.headless = (
+            True if node_config is None else node_config.get("headless", True)
+        )
+
+        self.loader_kwargs = (
+            {} if node_config is None else node_config.get("loader_kwargs", {})
+        )
+
+        self.browser_base = (
+            None if node_config is None else node_config.get("browser_base", None)
+        )
+
+        self.depth = (
+            1 if node_config is None else node_config.get("depth", 1)
+        )
+
+        self.only_inside_links = (
+            False if node_config is None else node_config.get("only_inside_links", False)
+        )
+
+        self.min_input_len = 1
 
     def execute(self, state: dict) -> dict:
-        pass
+        """
+        Executes the node's logic to fetch the HTML content of a specified URL and of all its
+        sub-links, and to update the graph's state with that content.
+
+        Args:
+            state (dict): The current state of the graph. The input keys will be used
+                          to fetch the correct data types from the state.
+
+        Returns:
+            dict: The updated state with a new output key containing the fetched HTML content.
+
+        Raises:
+            KeyError: If the input key is not found in the state, indicating that the
+                      necessary information to perform the operation is missing.
+        """
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        # Interpret input keys based on the provided input expression
+        input_keys = self.get_input_keys(state)
+        # Fetch data from the state based on the input keys
+        input_data = [state[key] for key in input_keys]
+
+        source = input_data[0]
+
+        documents = [{"source": source}]
+
+        loader_kwargs = {}
+
+        if self.node_config is not None:
+            loader_kwargs = self.node_config.get("loader_kwargs", {})
+
+        # Each pass over the frontier fetches the pending pages and queues their links
+        for _ in range(self.depth):
+            documents = self.obtain_content(documents, loader_kwargs)
+
+        # Keep only the entries whose content was actually fetched
+        filtered_documents = [doc for doc in documents if 'document' in doc]
+
+        state.update({self.output[0]: filtered_documents})
+
+        return state
+
+    def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
+        """Fetches a single page, through Browserbase when configured, otherwise via ChromiumLoader."""
+        self.logger.info(f"--- (Fetching HTML from: {source}) ---")
+
+        if self.browser_base is not None:
+            try:
+                from ..docloaders.browser_base import browser_base_fetch
+            except ImportError:
+                raise ImportError("""The browserbase module is not installed.
+                                  Please install it using `pip install browserbase`.""")
+
+            data = browser_base_fetch(self.browser_base.get("api_key"),
+                                      self.browser_base.get("project_id"), [source])
+
+            document = [Document(page_content=content,
+                                 metadata={"source": source}) for content in data]
+        else:
+            loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
+            document = loader.load()
+
+        return document
+
+    def extract_links(self, html_content: str) -> list:
+        """Returns every href found in the given HTML."""
+        soup = BeautifulSoup(html_content, 'html.parser')
+        links = [link['href'] for link in soup.find_all('a', href=True)]
+        self.logger.info(f"Extracted {len(links)} links.")
+        return links
+
+    def get_full_links(self, base_url: str, links: list) -> list:
+        """Resolves relative links against the base URL, optionally skipping external ones."""
+        full_links = []
+        for link in links:
+            if self.only_inside_links and link.startswith("http"):
+                continue
+            full_link = link if link.startswith("http") else urljoin(base_url, link)
+            full_links.append(full_link)
+        return full_links
+
+    def obtain_content(self, documents: List, loader_kwargs) -> List:
+        """Fetches every document that has no content yet and queues its links as new documents."""
+        new_documents = []
+        # Iterate over a copy so that removing failed entries does not skip items
+        for doc in documents[:]:
+            source = doc['source']
+            if 'document' not in doc:
+                document = self.fetch_content(source, loader_kwargs)
+
+                if not document or not document[0].page_content.strip():
+                    self.logger.warning(f"Failed to fetch content for {source}")
+                    documents.remove(doc)
+                    continue
+
+                doc['document'] = document
+
+                links = self.extract_links(doc['document'][0].page_content)
+                full_links = self.get_full_links(source, links)
+
+                # Queue links that are not already tracked in either list
+                for link in full_links:
+                    if (not any(d.get('source', '') == link for d in documents)
+                            and not any(d.get('source', '') == link for d in new_documents)):
+                        new_documents.append({"source": link})
+
+        documents.extend(new_documents)
+        return documents
+
+    def process_links(self, base_url: str, links: list, loader_kwargs,
+                      depth: int, current_depth: int = 1) -> dict:
+        """Depth-first helper: fetches each link and recurses until the requested depth is reached."""
+        content_dict = {}
+        for idx, link in enumerate(links, start=1):
+            full_link = link if link.startswith("http") else urljoin(base_url, link)
+            self.logger.info(f"Processing link {idx}: {full_link}")
+            link_content = self.fetch_content(full_link, loader_kwargs)
+
+            if current_depth < depth:
+                new_links = self.extract_links(link_content[0].page_content)
+                content_dict.update(self.process_links(full_link, new_links,
+                                                       loader_kwargs, depth, current_depth + 1))
+            else:
+                self.logger.warning(f"Maximum depth reached for {full_link}")
+        return content_dict
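
To make the breadth-first expansion in execute()/obtain_content() concrete, here is a hypothetical standalone sketch of the node outside a graph; the node_config keys mirror the ones DepthSearchGraph passes in, and the state dict stands in for the graph state (URL and values are placeholders, not part of the commit):

# Hypothetical sketch: drive FetchNodeLevelK directly on a plain state dict.
from scrapegraphai.nodes import FetchNodeLevelK

fetch_node = FetchNodeLevelK(
    input="url",
    output=["docs"],
    node_config={
        "loader_kwargs": {},
        "depth": 2,                 # two expansion passes over the link frontier
        "only_inside_links": True,  # resolve relative links only, skip external ones
    },
)

state = fetch_node.execute({"url": "https://perinim.github.io/projects/"})
for doc in state["docs"]:
    print(doc["source"])            # every page fetched within the requested depth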
