Commit f8ce3d5

fix: Augment the information getting fetched from a webpage
1 parent 0ca52b1 commit f8ce3d5

2 files changed: +25 -7 lines changed

scrapegraphai/nodes/fetch_node.py

Lines changed: 18 additions & 3 deletions
@@ -6,7 +6,9 @@
 from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.documents import Document
 from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html
+import requests
+from bs4 import BeautifulSoup


 class FetchNode(BaseNode):
@@ -32,6 +34,7 @@ class FetchNode(BaseNode):
     def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"):
         super().__init__(node_name, "node", input, output, 1)

+        self.useSoup = True if node_config is None else node_config.get("useSoup", True)
         self.headless = True if node_config is None else node_config.get("headless", True)
         self.verbose = False if node_config is None else node_config.get("verbose", False)

@@ -67,10 +70,22 @@ def execute(self, state):
             })]
         # if it is a local directory
         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                 "source": "local_dir"
             })]

+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                links = soup.find_all('a')
+                link_urls = []
+                for link in links:
+                    if 'href' in link.attrs:
+                        link_urls.append(link['href'])
+                compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {url}")
         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:

@@ -87,7 +102,7 @@ def execute(self, state):

         document = loader.load()
         compressed_document = [
-            Document(page_content=remover(str(document[0].page_content)))]
+            Document(page_content=cleanup_html(str(document[0].page_content)))]

         state.update({self.output[0]: compressed_document})
         return state
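
Sketched below is a minimal standalone version of what the new useSoup branch in execute does, assuming requests, beautifulsoup4, and the renamed cleanup_html helper are available: fetch the page with requests, collect every href from its anchor tags with BeautifulSoup, and pass the prettified HTML plus the link list to cleanup_html. The fetch_with_links helper is a name made up for illustration, and the sketch raises on a non-200 response where the committed branch only prints a message (that message interpolates url, though the variable in scope is source). Inside the node the path is toggled via node_config, e.g. node_config={"useSoup": True}, and defaults to enabled.

import requests
from bs4 import BeautifulSoup

# Assumed import path; the commit renames scrapegraphai/utils/remover.py to cleanup_html.py.
from scrapegraphai.utils.cleanup_html import cleanup_html


def fetch_with_links(source: str) -> str:
    """Fetch a page, collect its anchor hrefs, and compress it with cleanup_html."""
    response = requests.get(source)
    if response.status_code != 200:
        # The committed code prints a warning here instead of raising.
        raise RuntimeError(f"Failed to retrieve contents from the webpage at url: {source}")
    soup = BeautifulSoup(response.text, "html.parser")
    # Keep only anchors that actually carry an href attribute.
    link_urls = [link["href"] for link in soup.find_all("a") if "href" in link.attrs]
    # cleanup_html appends ", URLs in page: [...]" when the list is non-empty.
    return cleanup_html(soup.prettify(), link_urls)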

scrapegraphai/utils/remover.py renamed to scrapegraphai/utils/cleanup_html.py

Lines changed: 7 additions & 4 deletions
@@ -5,7 +5,7 @@
 from minify_html import minify


-def remover(html_content: str) -> str:
+def cleanup_html(html_content: str, urls: list = []) -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.

@@ -17,7 +17,7 @@ def remover(html_content: str) -> str:

     Example:
         >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
-        >>> remover(html_content)
+        >>> cleanup_html(html_content)
         'Title: Example, Body: <body><p>Hello World!</p></body>'

     This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
@@ -35,9 +35,12 @@ def remover(html_content: str) -> str:

     # Body Extraction (if it exists)
     body_content = soup.find('body')
+    urls_content = ""
+    if urls:
+        urls_content = f", URLs in page: {urls}"
     if body_content:
         # Minify the HTML within the body tag
         minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body
+        return "Title: " + title + ", Body: " + minimized_body + urls_content

-    return "Title: " + title + ", Body: No body content found"
+    return "Title: " + title + ", Body: No body content found" + urls_content
