3
3
"""
4
4
import pandas as pd
5
5
import json
6
+ import requests
6
7
from typing import List , Optional
7
8
from langchain_community .document_loaders import AsyncChromiumLoader
8
9
from langchain_core .documents import Document
9
10
from langchain_community .document_loaders import PyPDFLoader
10
11
from .base_node import BaseNode
11
- from ..utils .remover import remover
12
+ from ..utils .cleanup_html import cleanup_html
12
13
13
14
14
15
class FetchNode (BaseNode ):
@@ -38,6 +39,8 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict] =
38
39
"headless" , True )
39
40
self .verbose = False if node_config is None else node_config .get (
40
41
"verbose" , False )
42
+ self .useSoup = True if node_config is None else node_config .get (
43
+ "useSoup" , True )
41
44
42
45
def execute (self , state ):
43
46
"""
@@ -94,9 +97,17 @@ def execute(self, state):
94
97
pass
95
98
96
99
elif not source .startswith ("http" ):
97
- compressed_document = [Document (page_content = remover (source ), metadata = {
100
+ compressed_document = [Document (page_content = cleanup_html (source ), metadata = {
98
101
"source" : "local_dir"
99
102
})]
103
+
104
+ elif self .useSoup :
105
+ response = requests .get (source )
106
+ if response .status_code == 200 :
107
+ cleanedup_html = cleanup_html (response .text , source )
108
+ compressed_document = [Document (page_content = cleanedup_html )]
109
+ else :
110
+ print(f"Failed to retrieve contents from the webpage at url: {source}")
100
111
101
112
else :
102
113
if self .node_config is not None and self .node_config .get ("endpoint" ) is not None :
@@ -114,7 +125,7 @@ def execute(self, state):
114
125
115
126
document = loader .load ()
116
127
compressed_document = [
117
- Document (page_content = remover (str (document [0 ].page_content )))]
128
+ Document (page_content = cleanup_html (str (document [0 ].page_content )))]
118
129
119
130
state .update ({self .output [0 ]: compressed_document })
120
131
return state
0 commit comments