"""
ParseNode Module
"""
-from typing import Tuple, List, Optional
-from urllib.parse import urljoin
-import re
+from typing import List, Optional
from semchunk import chunk
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document
from .base_node import BaseNode
-from ..helpers import default_filters

class ParseNode(BaseNode):
    """
@@ -43,60 +40,6 @@ def __init__(
        self.parse_html = (
            True if node_config is None else node_config.get("parse_html", True)
        )
-        self.llm_model = node_config['llm_model']
-        self.parse_urls = (
-            False if node_config is None else node_config.get("parse_urls", False)
-        )
-
-    def _clean_urls(self, urls: List[str]) -> List[str]:
-        """
-        Cleans the URLs extracted from the text.
-
-        Args:
-            urls (List[str]): The list of URLs to clean.
-
-        Returns:
-            List[str]: The cleaned URLs.
-        """
-        cleaned_urls = []
-        for url in urls:
-            url = re.sub(r'.*?\]\(', '', url)
-
-            url = url.rstrip(').')
-
-            cleaned_urls.append(url)
-
-        return cleaned_urls
-
-    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
-        """
-        Extracts URLs from the given text.
-
-        Args:
-            text (str): The text to extract URLs from.
-
-        Returns:
-            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
-        """
-        if not self.parse_urls:
-            return [], []
-
-        image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.', '')
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-        all_urls = url_pattern.findall(text)
-        all_urls = self._clean_urls(all_urls)
-
-        if not source.startswith("http"):
-            all_urls = [url for url in all_urls if url.startswith("http")]
-        else:
-            all_urls = [urljoin(source, url) for url in all_urls]
-
-        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
-        links = [url for url in all_urls if url not in images]
-
-        return links, images

    def execute(self, state: dict) -> dict:
        """
@@ -119,46 +62,33 @@ def execute(self, state: dict) -> dict:
        input_keys = self.get_input_keys(state)

        input_data = [state[key] for key in input_keys]
-
        docs_transformed = input_data[0]
-        source = input_data[1] if self.parse_urls else None
-
-        def count_tokens(text):
-            from ..utils import token_count
-            return token_count(text, self.llm_model.model_name)

        if self.parse_html:
-            docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
+            docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
            docs_transformed = docs_transformed[0]

-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
            chunks = chunk(text=docs_transformed.page_content,
                           chunk_size=self.node_config.get("chunk_size", 4096) - 250,
-                           token_counter=count_tokens,
+                           token_counter=lambda text: len(text.split()),
                           memoize=False)
        else:
            docs_transformed = docs_transformed[0]

-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
            chunk_size = self.node_config.get("chunk_size", 4096)
            chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

            if isinstance(docs_transformed, Document):
                chunks = chunk(text=docs_transformed.page_content,
                               chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                               memoize=False)
            else:
                chunks = chunk(text=docs_transformed,
                               chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                               memoize=False)

        state.update({self.output[0]: chunks})
-        if self.parse_urls:
-            state.update({self.output[1]: link_urls})
-            state.update({self.output[2]: img_urls})

        return state
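For context on the `token_counter` change above, here is a minimal, illustrative sketch (not part of this commit) of how the new lambda sizes chunks: it counts whitespace-separated words instead of calling the model tokenizer that the removed `count_tokens` helper used. The `word_counter` name and the sample text are placeholders.

```python
from semchunk import chunk

# Approximate "token" counting by whitespace-separated words, mirroring the
# lambda introduced in the diff (the removed count_tokens helper called the
# model tokenizer via token_count instead).
def word_counter(text: str) -> int:
    return len(text.split())

sample_text = "Example page content produced by Html2TextTransformer."  # placeholder input

chunks = chunk(text=sample_text,
               chunk_size=4096 - 250,   # default chunk_size minus the margin used in execute()
               token_counter=word_counter,
               memoize=False)
print(chunks)
```

This trades tokenizer accuracy for speed and drops the node's dependency on `llm_model`; word counts usually undershoot true token counts, which the chunk-size margins in `execute()` help absorb.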