@@ -121,7 +121,7 @@ def execute(self, state):
121
121
"xml" : self .handle_file ,
122
122
"md" : self .handle_file ,
123
123
}
124
-
124
+
125
125
if input_type in handlers :
126
126
return handlers [input_type ](state , input_type , source )
127
127
elif self .input == "pdf_dir" :
@@ -130,7 +130,7 @@ def execute(self, state):
130
130
return self .handle_local_source (state , source )
131
131
else :
132
132
return self .handle_web_source (state , source )
133
-
133
+
134
134
def handle_directory (self , state , input_type , source ):
135
135
"""
136
136
Handles the directory by compressing the source document and updating the state.
@@ -143,7 +143,7 @@ def handle_directory(self, state, input_type, source):
143
143
Returns:
144
144
dict: The updated state with the compressed document.
145
145
"""
146
-
146
+
147
147
compressed_document = [
148
148
source
149
149
]
@@ -169,11 +169,11 @@ def handle_file(self, state, input_type, source):
169
169
- "xml": Reads the content of an XML file as a string.
170
170
- "md": Reads the content of a Markdown file as a string.
171
171
"""
172
-
172
+
173
173
compressed_document = self .load_file_content (source , input_type )
174
-
174
+
175
175
return self .update_state (state , compressed_document )
176
-
176
+
177
177
def load_file_content (self , source , input_type ):
178
178
"""
179
179
Loads the content of a file based on its input type.
@@ -185,7 +185,7 @@ def load_file_content(self, source, input_type):
185
185
Returns:
186
186
list: A list containing a Document object with the loaded content and metadata.
187
187
"""
188
-
188
+
189
189
if input_type == "pdf" :
190
190
loader = PyPDFLoader (source )
191
191
return loader .load ()
@@ -198,7 +198,7 @@ def load_file_content(self, source, input_type):
198
198
with open (source , "r" , encoding = "utf-8" ) as f :
199
199
data = f .read ()
200
200
return [Document (page_content = data , metadata = {"source" : input_type })]
201
-
201
+
202
202
def handle_local_source (self , state , source ):
203
203
"""
204
204
Handles the local source by fetching HTML content, optionally converting it to Markdown,
@@ -214,11 +214,11 @@ def handle_local_source(self, state, source):
214
214
Raises:
215
215
ValueError: If the source is empty or contains only whitespace.
216
216
"""
217
-
217
+
218
218
self .logger .info (f"--- (Fetching HTML from: { source } ) ---" )
219
219
if not source .strip ():
220
220
raise ValueError ("No HTML body content found in the local source." )
221
-
221
+
222
222
parsed_content = source
223
223
224
224
if isinstance (self .llm_model , ChatOpenAI ) and not self .script_creator or self .force and not self .script_creator :
@@ -229,13 +229,13 @@ def handle_local_source(self, state, source):
229
229
compressed_document = [
230
230
Document (page_content = parsed_content , metadata = {"source" : "local_dir" })
231
231
]
232
-
232
+
233
233
return self .update_state (state , compressed_document )
234
-
234
+
235
235
def handle_web_source (self , state , source ):
236
236
"""
237
- Handles the web source by fetching HTML content from a URL, optionally converting it to Markdown,
238
- and updating the state.
237
+ Handles the web source by fetching HTML content from a URL,
238
+ optionally converting it to Markdown, and updating the state.
239
239
240
240
Parameters:
241
241
state (dict): The current state of the graph.
@@ -247,7 +247,7 @@ def handle_web_source(self, state, source):
247
247
Raises:
248
248
ValueError: If the fetched HTML content is empty or contains only whitespace.
249
249
"""
250
-
250
+
251
251
self .logger .info (f"--- (Fetching HTML from: { source } ) ---" )
252
252
if self .use_soup :
253
253
response = requests .get (source )
0 commit comments