
Commit 864aa91

feat: revert fetch_node

1 parent 63c0dd9

3 files changed: +9 −26 lines

scrapegraphai/nodes/fetch_node.py

Lines changed: 4 additions & 19 deletions
@@ -8,9 +8,7 @@
 from langchain_core.documents import Document
 from langchain_community.document_loaders import PyPDFLoader
 from .base_node import BaseNode
-from ..utils.cleanup_html import cleanup_html
-import requests
-from bs4 import BeautifulSoup
+from ..utils.remover import remover
 
 
 class FetchNode(BaseNode):

@@ -36,7 +34,6 @@ class FetchNode(BaseNode):
     def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
         super().__init__(node_name, "node", input, output, 1)
 
-
         self.headless = True if node_config is None else node_config.get(
             "headless", True)
         self.verbose = False if node_config is None else node_config.get(

@@ -97,22 +94,10 @@ def execute(self, state):
             pass
 
         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=cleanup_html(source), metadata={
+            compressed_document = [Document(page_content=remover(source), metadata={
                 "source": "local_dir"
             })]
 
-        elif self.useSoup:
-            response = requests.get(source)
-            if response.status_code == 200:
-                soup = BeautifulSoup(response.text, 'html.parser')
-                links = soup.find_all('a')
-                link_urls = []
-                for link in links:
-                    if 'href' in link.attrs:
-                        link_urls.append(link['href'])
-                compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
-            else:
-                print(f"Failed to retrieve contents from the webpage at url: {url}")
         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:

@@ -129,7 +114,7 @@ def execute(self, state):
 
             document = loader.load()
             compressed_document = [
-                Document(page_content=cleanup_html(str(document[0].page_content)))]
+                Document(page_content=remover(str(document[0].page_content)))]
 
         state.update({self.output[0]: compressed_document})
-        return state
+        return state
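In effect, the non-http branch now routes local HTML through the renamed remover helper instead of cleanup_html, and the useSoup/requests link-scraping path is dropped entirely. A minimal sketch of that local branch in isolation; the source value and the surrounding execute() state handling are placeholders, not part of this diff:

    from langchain_core.documents import Document
    from scrapegraphai.utils.remover import remover

    # Placeholder input standing in for the node's "source" state value.
    source = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"

    if not source.startswith("http"):
        # Compress the raw HTML with remover before wrapping it in a Document,
        # as the hunk at -97,22 above now does.
        compressed_document = [Document(page_content=remover(source), metadata={
            "source": "local_dir"
        })]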

scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
 from .proxy_rotation import proxy_generator
+from .remover import remover
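Since utils/__init__.py now re-exports the helper, it can be imported from the package namespace. A quick usage sketch; the expected output string is taken from the function's own docstring example:

    from scrapegraphai.utils import remover

    html = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
    print(remover(html))
    # 'Title: Example, Body: <body><p>Hello World!</p></body>'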

scrapegraphai/utils/cleanup_html.py renamed to scrapegraphai/utils/remover.py

Lines changed: 4 additions & 7 deletions
@@ -5,7 +5,7 @@
 from minify_html import minify
 
 
-def cleanup_html(html_content: str, urls: list = []) -> str:
+def remover(html_content: str) -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
 

@@ -17,7 +17,7 @@ def cleanup_html(html_content: str, urls: list = []) -> str:
 
     Example:
     >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
-    >>> cleanup_html(html_content)
+    >>> remover(html_content)
     'Title: Example, Body: <body><p>Hello World!</p></body>'
 
     This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.

@@ -35,12 +35,9 @@ def cleanup_html(html_content: str, urls: list = []) -> str:
 
     # Body Extraction (if it exists)
     body_content = soup.find('body')
-    urls_content = ""
-    if urls:
-        urls_content = f", URLs in page: {urls}"
     if body_content:
         # Minify the HTML within the body tag
         minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body + urls_content
+        return "Title: " + title + ", Body: " + minimized_body
 
-    return "Title: " + title + ", Body: No body content found" + urls_content
+    return "Title: " + title + ", Body: No body content found"
