
Commit b752499

Merge pull request #217 from mayurdb/fetchLinkFix
Fetch links in the page while parsing html
2 parents 1fa77e5 + 300fd5d commit b752499

File tree

3 files changed: +29 -8 lines changed


scrapegraphai/nodes/fetch_node.py

Lines changed: 14 additions & 3 deletions
@@ -3,12 +3,13 @@
 """
 import pandas as pd
 import json
+import requests
 from typing import List, Optional
 from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.documents import Document
 from langchain_community.document_loaders import PyPDFLoader
 from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html


 class FetchNode(BaseNode):
@@ -38,6 +39,8 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict] =
             "headless", True)
         self.verbose = False if node_config is None else node_config.get(
             "verbose", False)
+        self.useSoup = True if node_config is None else node_config.get(
+            "useSoup", True)

     def execute(self, state):
         """
@@ -94,9 +97,17 @@ def execute(self, state):
                 pass

         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                 "source": "local_dir"
             })]
+
+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                cleanedup_html = cleanup_html(response.text, source)
+                compressed_document = [Document(page_content=cleanedup_html)]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {url}")

         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:
@@ -114,7 +125,7 @@ def execute(self, state):

             document = loader.load()
             compressed_document = [
-                Document(page_content=remover(str(document[0].page_content)))]
+                Document(page_content=cleanup_html(str(document[0].page_content)))]

         state.update({self.output[0]: compressed_document})
         return state
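
In practice, the new useSoup branch is just a plain requests fetch followed by cleanup_html. A minimal standalone sketch of that path (with a made-up source URL, and logging the failure with source rather than the url name used in the diff above):

    import requests
    from langchain_core.documents import Document
    from scrapegraphai.utils.cleanup_html import cleanup_html

    source = "https://example.com"  # hypothetical URL standing in for the node's input

    response = requests.get(source)
    if response.status_code == 200:
        # cleanup_html now takes the base URL so relative links can be resolved
        cleaned_html = cleanup_html(response.text, source)
        compressed_document = [Document(page_content=cleaned_html)]
    else:
        print(f"Failed to retrieve contents from the webpage at url: {source}")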

scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,4 +6,4 @@
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
 from .proxy_rotation import proxy_generator
-from .remover import remover
+from .cleanup_html import cleanup_html
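
With the rename, callers that previously imported the helper as remover now import cleanup_html instead; a hypothetical call site would change along these lines:

    # before this commit
    from scrapegraphai.utils import remover
    cleaned = remover(html_string)  # html_string: some HTML text you already have

    # after this commit (a base URL is now required so links can be resolved)
    from scrapegraphai.utils import cleanup_html
    cleaned = cleanup_html(html_string, "https://example.com")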

scrapegraphai/utils/remover.py renamed to scrapegraphai/utils/cleanup_html.py

Lines changed: 14 additions & 4 deletions
@@ -3,9 +3,9 @@
 """
 from bs4 import BeautifulSoup
 from minify_html import minify
+from urllib.parse import urljoin

-
-def remover(html_content: str) -> str:
+def cleanup_html(html_content: str, base_url: str) -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.

@@ -33,11 +33,21 @@ def remover(html_content: str) -> str:
     for tag in soup.find_all(['script', 'style']):
         tag.extract()

+    # Links extraction
+    links = soup.find_all('a')
+    link_urls = []
+    for link in links:
+        if 'href' in link.attrs:
+            link_urls.append(urljoin(base_url, link['href']))
+
     # Body Extraction (if it exists)
     body_content = soup.find('body')
     if body_content:
         # Minify the HTML within the body tag
         minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body
+        print("Came here")
+        return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls)
+

-    return "Title: " + title + ", Body: No body content found"
+    print("No Came here")
+    return "Title: " + title + ", Body: No body content found" + ", Links: " + str(link_urls)

0 commit comments
