 """
 ParseNode Module
 """
-from typing import List, Optional
+from typing import Tuple, List, Optional
+from urllib.parse import urljoin
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
+from ..helpers import default_filters
+
+import re
 
 class ParseNode(BaseNode):
     """
@@ -41,6 +45,66 @@ def __init__(
             True if node_config is None else node_config.get("parse_html", True)
         )
         self.llm_model = node_config['llm_model']
+        self.parse_urls = (
+            False if node_config is None else node_config.get("parse_urls", False)
+        )
+
+    def _clean_urls(self, urls: List[str]) -> List[str]:
+        """
+        Cleans the URLs extracted from the text.
+
+        Args:
+            urls (List[str]): The list of URLs to clean.
+
+        Returns:
+            List[str]: The cleaned URLs.
+        """
+        cleaned_urls = []
+        for url in urls:
+            # Strip any leading markdown-image prefix such as 'thumbnail]('
+            url = re.sub(r'.*?\]\(', '', url)
+
+            # Strip trailing ')' and '.' characters
+            url = url.rstrip(').')
+
+            cleaned_urls.append(url)
+
+        return cleaned_urls
+
+    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
+        """
+        Extracts URLs from the given text.
+
+        Args:
+            text (str): The text to extract URLs from.
+            source (str): The source URL, used to resolve relative links.
+
+        Returns:
+            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
+        """
+        # Return empty lists if URL parsing is disabled
+        if not self.parse_urls:
+            return [], []
+
+        # Regular expression to find URLs (both links and images)
+        image_extensions = default_filters.filter_dict["img_exts"]
+        image_extension_seq = '|'.join(image_extensions).replace('.', '')
+        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
+
+        # Find all URLs in the string
+        all_urls = url_pattern.findall(text)
+        all_urls = self._clean_urls(all_urls)
+
+        if not source.startswith("http"):
+            # Drop any URL that is not absolute
+            all_urls = [url for url in all_urls if url.startswith("http")]
+        else:
+            # Resolve relative URLs against the source URL
+            all_urls = [urljoin(source, url) for url in all_urls]
+
+        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
+        links = [url for url in all_urls if url not in images]
+
+        return links, images
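
As a sanity check on the method added above, here is a minimal, self-contained sketch of what the extraction and cleanup do to the markdown-style text that `Html2TextTransformer` emits. The `image_extensions` list is a stand-in assumption, since `default_filters.filter_dict["img_exts"]` is not shown in this diff:

```python
import re
from urllib.parse import urljoin

# Assumed stand-in for default_filters.filter_dict["img_exts"] (not part of this diff).
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']
image_extension_seq = '|'.join(image_extensions).replace('.', '')
url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')

text = "See [docs](https://example.com/guide) and ![thumbnail](/assets/logo.png)."
source = "https://example.com/page"

all_urls = url_pattern.findall(text)
# ['https://example.com/guide)', '![thumbnail](/assets/logo.png']

# Same cleanup as _clean_urls: drop the markdown '](' prefix, strip trailing ')' and '.'.
all_urls = [re.sub(r'.*?\]\(', '', u).rstrip(').') for u in all_urls]
# ['https://example.com/guide', '/assets/logo.png']

# source starts with "http", so relative paths are resolved against it.
all_urls = [urljoin(source, u) for u in all_urls]

images = [u for u in all_urls if any(u.endswith(ext) for ext in image_extensions)]
links = [u for u in all_urls if u not in images]
print(links)   # ['https://example.com/guide']
print(images)  # ['https://example.com/assets/logo.png']
```
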
 
     def execute(self, state: dict) -> dict:
         """
@@ -63,7 +127,9 @@ def execute(self, state: dict) -> dict:
         input_keys = self.get_input_keys(state)
 
         input_data = [state[key] for key in input_keys]
+
         docs_transformed = input_data[0]
+        source = input_data[1] if self.parse_urls else None
 
         def count_tokens(text):
             from ..utils import token_count
@@ -73,12 +139,17 @@ def count_tokens(text):
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
+            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096) - 250,
                            token_counter=count_tokens,
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]
+
+            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 
@@ -94,4 +165,8 @@ def count_tokens(text):
                            memoize=False)
 
         state.update({self.output[0]: chunks})
+        if self.parse_urls:
+            state.update({self.output[1]: link_urls})
+            state.update({self.output[2]: img_urls})
+
         return state
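
A minimal usage sketch of the extended node, assuming the `scrapegraphai.nodes` import path and the library's `&` input-expression syntax; the output key names are illustrative, not taken from this diff. Note that `parse_urls=True` requires two input keys (document and source URL) and three output keys, since `execute` now writes to `self.output[1]` and `self.output[2]`:

```python
from langchain_core.documents import Document
from scrapegraphai.nodes import ParseNode  # assumed import path

parse_node = ParseNode(
    input="doc & url",  # two input keys: the document and its source URL
    output=["parsed_doc", "link_urls", "img_urls"],  # illustrative key names
    node_config={
        "chunk_size": 4096,
        "parse_urls": True,   # the new flag added by this change
        "llm_model": None,    # placeholder; a real graph passes its configured chat
                              # model here so token counting works during chunking
    },
)

state = {
    "doc": [Document(page_content="<html><a href='/guide'>guide</a></html>")],
    "url": "https://example.com/page",
}
state = parse_node.execute(state)
# state["parsed_doc"] -> list of text chunks
# state["link_urls"]  -> extracted links, state["img_urls"] -> extracted image URLs
```
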