
Commit 864aa91

feat: revert fetch_node

1 parent 63c0dd9

3 files changed: +9 −26 lines

scrapegraphai/nodes/fetch_node.py

Lines changed: 4 additions & 19 deletions
@@ -8,9 +8,7 @@
 from langchain_core.documents import Document
 from langchain_community.document_loaders import PyPDFLoader
 from .base_node import BaseNode
-from ..utils.cleanup_html import cleanup_html
-import requests
-from bs4 import BeautifulSoup
+from ..utils.remover import remover
 
 
 class FetchNode(BaseNode):

@@ -36,7 +34,6 @@ class FetchNode(BaseNode):
     def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
         super().__init__(node_name, "node", input, output, 1)
 
-
         self.headless = True if node_config is None else node_config.get(
             "headless", True)
         self.verbose = False if node_config is None else node_config.get(

@@ -97,22 +94,10 @@ def execute(self, state):
             pass
 
         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=cleanup_html(source), metadata={
+            compressed_document = [Document(page_content=remover(source), metadata={
                 "source": "local_dir"
             })]
 
-        elif self.useSoup:
-            response = requests.get(source)
-            if response.status_code == 200:
-                soup = BeautifulSoup(response.text, 'html.parser')
-                links = soup.find_all('a')
-                link_urls = []
-                for link in links:
-                    if 'href' in link.attrs:
-                        link_urls.append(link['href'])
-                compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
-            else:
-                print(f"Failed to retrieve contents from the webpage at url: {url}")
         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:

@@ -129,7 +114,7 @@ def execute(self, state):
 
             document = loader.load()
             compressed_document = [
-                Document(page_content=cleanup_html(str(document[0].page_content)))]
+                Document(page_content=remover(str(document[0].page_content)))]
 
         state.update({self.output[0]: compressed_document})
-        return state
+        return state
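In effect, the non-http branch now routes local HTML through the renamed remover helper instead of cleanup_html, and the useSoup/requests link-scraping path is dropped entirely. A minimal sketch of that local branch in isolation; the source value and the surrounding execute() state handling are placeholders, not part of this diff:

    from langchain_core.documents import Document
    from scrapegraphai.utils.remover import remover

    # Placeholder input standing in for the node's "source" state value.
    source = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"

    if not source.startswith("http"):
        # Compress the raw HTML with remover before wrapping it in a Document,
        # as the hunk at -97,22 above now does.
        compressed_document = [Document(page_content=remover(source), metadata={
            "source": "local_dir"
        })]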

scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
 from .proxy_rotation import proxy_generator
+from .remover import remover
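Since utils/__init__.py now re-exports the helper, it can be imported from the package namespace. A quick usage sketch; the expected output string is taken from the function's own docstring example:

    from scrapegraphai.utils import remover

    html = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
    print(remover(html))
    # 'Title: Example, Body: <body><p>Hello World!</p></body>'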

scrapegraphai/utils/cleanup_html.py renamed to scrapegraphai/utils/remover.py

Lines changed: 4 additions & 7 deletions
@@ -5,7 +5,7 @@
 from minify_html import minify
 
 
-def cleanup_html(html_content: str, urls: list = []) -> str:
+def remover(html_content: str) -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
 

@@ -17,7 +17,7 @@ def cleanup_html(html_content: str, urls: list = []) -> str:
 
     Example:
     >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
-    >>> cleanup_html(html_content)
+    >>> remover(html_content)
     'Title: Example, Body: <body><p>Hello World!</p></body>'
 
     This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.

@@ -35,12 +35,9 @@ def cleanup_html(html_content: str, urls: list = []) -> str:
 
     # Body Extraction (if it exists)
     body_content = soup.find('body')
-    urls_content = ""
-    if urls:
-        urls_content = f", URLs in page: {urls}"
     if body_content:
         # Minify the HTML within the body tag
         minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body + urls_content
+        return "Title: " + title + ", Body: " + minimized_body
 
-    return "Title: " + title + ", Body: No body content found" + urls_content
+    return "Title: " + title + ", Body: No body content found"
