8
8
from langchain_core .documents import Document
9
9
from langchain_community .document_loaders import PyPDFLoader
10
10
from .base_node import BaseNode
11
- from ..utils .cleanup_html import cleanup_html
12
- import requests
13
- from bs4 import BeautifulSoup
11
+ from ..utils .remover import remover
14
12
15
13
16
14
class FetchNode (BaseNode ):
@@ -36,7 +34,6 @@ class FetchNode(BaseNode):
36
34
def __init__ (self , input : str , output : List [str ], node_config : Optional [dict ] = None , node_name : str = "Fetch" ):
37
35
super ().__init__ (node_name , "node" , input , output , 1 )
38
36
39
-
40
37
self .headless = True if node_config is None else node_config .get (
41
38
"headless" , True )
42
39
self .verbose = False if node_config is None else node_config .get (
@@ -97,22 +94,10 @@ def execute(self, state):
97
94
pass
98
95
99
96
elif not source .startswith ("http" ):
100
- compressed_document = [Document (page_content = cleanup_html (source ), metadata = {
97
+ compressed_document = [Document (page_content = remover (source ), metadata = {
101
98
"source" : "local_dir"
102
99
})]
103
100
104
- elif self .useSoup :
105
- response = requests .get (source )
106
- if response .status_code == 200 :
107
- soup = BeautifulSoup (response .text , 'html.parser' )
108
- links = soup .find_all ('a' )
109
- link_urls = []
110
- for link in links :
111
- if 'href' in link .attrs :
112
- link_urls .append (link ['href' ])
113
- compressed_document = [Document (page_content = cleanup_html (soup .prettify (), link_urls ))]
114
- else :
115
- print (f"Failed to retrieve contents from the webpage at url: { url } " )
116
101
else :
117
102
if self .node_config is not None and self .node_config .get ("endpoint" ) is not None :
118
103
@@ -129,7 +114,7 @@ def execute(self, state):
129
114
130
115
document = loader .load ()
131
116
compressed_document = [
132
- Document (page_content = cleanup_html (str (document [0 ].page_content )))]
117
+ Document (page_content = remover (str (document [0 ].page_content )))]
133
118
134
119
state .update ({self .output [0 ]: compressed_document })
135
- return state
120
+ return state
0 commit comments