from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document
from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html
+import requests
+from bs4 import BeautifulSoup


class FetchNode(BaseNode):
@@ -32,6 +34,7 @@ class FetchNode(BaseNode):
    def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
        super().__init__(node_name, "node", input, output, 1)

+        self.useSoup = True if node_config is None else node_config.get("useSoup", True)
        self.headless = True if node_config is None else node_config.get("headless", True)
        self.verbose = False if node_config is None else node_config.get("verbose", False)
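The three `node_config` lookups above share one pattern: use a hard default when no config is given, otherwise `dict.get` with that same default. A minimal sketch of how a caller would toggle the new flag (the import path and input expression are assumptions, not taken from this diff):

```python
# Hypothetical usage: disable the new requests/BeautifulSoup path so the
# node falls back to AsyncChromiumLoader for JavaScript-heavy pages.
from scrapegraphai.nodes import FetchNode  # assumed package layout

fetch_node = FetchNode(
    input="url | local_dir",  # assumed input expression
    output=["doc"],
    node_config={
        "useSoup": False,   # default is True, i.e. plain HTTP fetching
        "headless": True,   # only consulted on the Chromium path
        "verbose": False,
    },
)
```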
@@ -67,10 +70,22 @@ def execute(self, state):
            })]
        # if it is a local directory
        elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                "source": "local_dir"
            })]

+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                links = soup.find_all('a')
+                link_urls = []
+                for link in links:
+                    if 'href' in link.attrs:
+                        link_urls.append(link['href'])
+                compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {source}")
        else:
            if self.node_config is not None and self.node_config.get("endpoint") is not None:
@@ -87,7 +102,7 @@ def execute(self, state):

            document = loader.load()
            compressed_document = [
-                Document(page_content=remover(str(document[0].page_content)))]
+                Document(page_content=cleanup_html(str(document[0].page_content)))]

        state.update({self.output[0]: compressed_document})
        return state
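Taken on its own, the new `useSoup` branch reduces to the following standalone sketch; `fetch_with_soup` is an illustrative name, and the tuple it returns stands in for the `cleanup_html(soup.prettify(), link_urls)` call, whose exact signature lives in `..utils.cleanup_html`:

```python
# Standalone sketch of the requests/BeautifulSoup fetch path added above.
import requests
from bs4 import BeautifulSoup

def fetch_with_soup(url: str):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve contents from the webpage at url: {url}")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    # Mirror the link_urls loop: collect every href on the page.
    link_urls = [a["href"] for a in soup.find_all("a") if "href" in a.attrs]
    return soup.prettify(), link_urls

if __name__ == "__main__":
    result = fetch_with_soup("https://example.com")
    if result is not None:
        html, links = result
        print(f"Fetched {len(html)} characters and {len(links)} links.")
```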