@@ -50,35 +50,48 @@ def execute(self, state: dict) -> dict:
50
50
51
51
Args:
52
52
state (dict): The current state of the graph. The input keys will be used to fetch the
53
- correct data from the state.
53
+ correct data from the state.
54
54
55
55
Returns:
56
56
dict: The updated state with the output key containing the parsed content chunks.
57
57
58
58
Raises:
59
- KeyError: If the input keys are not found in the state.
59
+ KeyError: If the input keys are not found in the state, indicating that the
60
+ necessary information for parsing the content is missing.
60
61
"""
61
62
62
63
self .logger .info (f"--- Executing { self .node_name } Node ---" )
63
64
64
- # Fetch data using input keys
65
+ # Interpret input keys based on the provided input expression
65
66
input_keys = self .get_input_keys (state )
67
+
68
+ # Fetching data from the state based on the input keys
66
69
input_data = [state [key ] for key in input_keys ]
70
+ # Parse the document
67
71
docs_transformed = input_data [0 ]
68
-
69
- # Parse HTML if enabled
70
72
if self .parse_html :
71
73
docs_transformed = Html2TextTransformer ().transform_documents (input_data [0 ])
72
74
docs_transformed = docs_transformed [0 ]
73
75
74
- # Get text content
75
- text_content = docs_transformed . page_content if type ( docs_transformed ) == Document else docs_transformed
76
-
77
- # Chunk the text
78
- chunk_size = self . node_config . get ( "chunk_size" , 4096 ) - 250
79
- chunks = chunk ( text = text_content , chunk_size = chunk_size , token_counter = lambda x : len ( x . split ()), memoize = False )
76
+ chunks = chunk ( text = docs_transformed . page_content ,
77
+ chunk_size = self . node_config . get ( "chunk_size" , 4096 ),
78
+ token_counter = lambda x : len ( x ),
79
+ memoize = False )
80
+ else :
81
+ docs_transformed = docs_transformed [ 0 ]
80
82
81
- # Update state with chunks
83
+ if type (docs_transformed ) == Document :
84
+ chunks = chunk (text = docs_transformed .page_content ,
85
+ chunk_size = self .node_config .get ("chunk_size" , 4096 ),
86
+ token_counter = lambda x : len (x ),
87
+ memoize = False )
88
+ else :
89
+
90
+ chunks = chunk (text = docs_transformed ,
91
+ chunk_size = self .node_config .get ("chunk_size" , 4096 ),
92
+ token_counter = lambda x : len (x ),
93
+ memoize = False )
94
+
82
95
state .update ({self .output [0 ]: chunks })
83
96
84
- return state
97
+ return state
0 commit comments