3
3
"""
4
4
from typing import Tuple , List , Optional
5
5
from urllib .parse import urljoin
6
+ import re
6
7
from semchunk import chunk
7
8
from langchain_community .document_transformers import Html2TextTransformer
8
9
from langchain_core .documents import Document
9
10
from .base_node import BaseNode
10
11
from ..helpers import default_filters
11
12
12
- import re
13
-
14
13
class ParseNode (BaseNode ):
15
14
"""
16
15
A node responsible for parsing HTML content from a document.
@@ -61,14 +60,12 @@ def _clean_urls(self, urls: List[str]) -> List[str]:
61
60
"""
62
61
cleaned_urls = []
63
62
for url in urls :
64
- # Remove any leading 'thumbnail](' or similar patterns
65
63
url = re .sub (r'.*?\]\(' , '' , url )
66
-
67
- # Remove any trailing parentheses or brackets
64
+
68
65
url = url .rstrip (').' )
69
-
66
+
70
67
cleaned_urls .append (url )
71
-
68
+
72
69
return cleaned_urls
73
70
74
71
def extract_urls (self , text : str , source : str ) -> Tuple [List [str ], List [str ]]:
@@ -81,26 +78,21 @@ def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
81
78
Returns:
82
79
Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
83
80
"""
84
- # Return empty lists if the URLs are not to be parsed
85
81
if not self .parse_urls :
86
82
return [], []
87
-
88
- # Regular expression to find URLs (both links and images)
83
+
89
84
image_extensions = default_filters .filter_dict ["img_exts" ]
90
85
image_extension_seq = '|' .join (image_extensions ).replace ('.' ,'' )
91
86
url_pattern = re .compile (r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))' )
92
87
93
- # Find all URLs in the string
94
88
all_urls = url_pattern .findall (text )
95
89
all_urls = self ._clean_urls (all_urls )
96
90
97
91
if not source .startswith ("http" ):
98
- # Remove any URLs that is not complete
99
92
all_urls = [url for url in all_urls if url .startswith ("http" )]
100
93
else :
101
- # Add to local URLs the source URL
102
94
all_urls = [urljoin (source , url ) for url in all_urls ]
103
-
95
+
104
96
images = [url for url in all_urls if any (url .endswith (ext ) for ext in image_extensions )]
105
97
links = [url for url in all_urls if url not in images ]
106
98
@@ -136,7 +128,7 @@ def count_tokens(text):
136
128
return token_count (text , self .llm_model .model_name )
137
129
138
130
if self .parse_html :
139
- docs_transformed = Html2TextTransformer ().transform_documents (input_data [0 ])
131
+ docs_transformed = Html2TextTransformer (ignore_links = False ).transform_documents (input_data [0 ])
140
132
docs_transformed = docs_transformed [0 ]
141
133
142
134
link_urls , img_urls = self .extract_urls (docs_transformed .page_content , source )
0 commit comments