4
4
from typing import List , Optional
5
5
import re
6
6
from tqdm import tqdm
7
+ from urllib .parse import urlparse , parse_qs
7
8
from langchain .prompts import PromptTemplate
8
9
from langchain_core .output_parsers import JsonOutputParser
9
10
from langchain_core .runnables import RunnableParallel
10
11
from ..utils .logging import get_logger
11
12
from .base_node import BaseNode
12
13
from ..prompts import TEMPLATE_RELEVANT_LINKS
14
+ from ..helpers import default_filters
13
15
14
16
15
17
class SearchLinkNode (BaseNode ):
@@ -39,10 +41,54 @@ def __init__(
39
41
super ().__init__ (node_name , "node" , input , output , 1 , node_config )
40
42
41
43
self .llm_model = node_config ["llm_model" ]
42
- self .verbose = (
43
- False if node_config is None else node_config .get ("verbose" , False )
44
- )
45
44
45
+ # Apply filters if filter_links is True or if filter_config is provided
46
+ if node_config .get ("filter_links" , False ) or "filter_config" in node_config :
47
+ # Merge provided filter config with default filter config for partial configuration
48
+ provided_filter_config = node_config .get ("filter_config" , {})
49
+ self .filter_config = {** default_filters .filter_dict , ** provided_filter_config }
50
+ self .filter_links = True
51
+ else :
52
+ # Skip filtering if not enabled
53
+ self .filter_config = None
54
+ self .filter_links = False
55
+
56
+ self .verbose = node_config .get ("verbose" , False )
57
+ self .seen_links = set ()
58
+
59
+ def _is_same_domain (self , url , domain ):
60
+ if not self .filter_links or not self .filter_config .get ("diff_domain_filter" , True ):
61
+ return True # Skip the domain filter if not enabled
62
+ parsed_url = urlparse (url )
63
+ parsed_domain = urlparse (domain )
64
+ return parsed_url .netloc == parsed_domain .netloc
65
+
66
+ def _is_image_url (self , url ):
67
+ if not self .filter_links :
68
+ return False # Skip image filtering if filtering is not enabled
69
+
70
+ image_extensions = self .filter_config .get ("img_exts" , [])
71
+ return any (url .lower ().endswith (ext ) for ext in image_extensions )
72
+
73
+ def _is_language_url (self , url ):
74
+ if not self .filter_links :
75
+ return False # Skip language filtering if filtering is not enabled
76
+
77
+ lang_indicators = self .filter_config .get ("lang_indicators" , [])
78
+ parsed_url = urlparse (url )
79
+ query_params = parse_qs (parsed_url .query )
80
+
81
+ # Check if the URL path or query string indicates a language-specific version
82
+ return any (indicator in parsed_url .path .lower () or indicator in query_params for indicator in lang_indicators )
83
+
84
+ def _is_potentially_irrelevant (self , url ):
85
+ if not self .filter_links :
86
+ return False # Skip irrelevant URL filtering if filtering is not enabled
87
+
88
+ irrelevant_keywords = self .filter_config .get ("irrelevant_keywords" , [])
89
+ return any (keyword in url .lower () for keyword in irrelevant_keywords )
90
+
91
+
46
92
def execute (self , state : dict ) -> dict :
47
93
"""
48
94
Filter out relevant links from the webpage that are relevant to the prompt. Out of the filtered links, also
@@ -64,6 +110,7 @@ def execute(self, state: dict) -> dict:
64
110
65
111
66
112
parsed_content_chunks = state .get ("doc" )
113
+ source_url = state .get ("url" ) or state .get ("local_dir" )
67
114
output_parser = JsonOutputParser ()
68
115
69
116
relevant_links = []
@@ -76,10 +123,28 @@ def execute(self, state: dict) -> dict:
76
123
)
77
124
):
78
125
try :
126
+
79
127
# Primary approach: Regular expression to extract links
80
128
links = re .findall (r'https?://[^\s"<>\]]+' , str (chunk .page_content ))
81
129
82
- relevant_links += links
130
+ if not self .filter_links :
131
+ links = list (set (links ))
132
+
133
+ relevant_links += links
134
+ self .seen_links .update (relevant_links )
135
+ else :
136
+ filtered_links = [
137
+ link for link in links
138
+ if self ._is_same_domain (link , source_url )
139
+ and not self ._is_image_url (link )
140
+ and not self ._is_language_url (link )
141
+ and not self ._is_potentially_irrelevant (link )
142
+ and link not in self .seen_links
143
+ ]
144
+ filtered_links = list (set (filtered_links ))
145
+ relevant_links += filtered_links
146
+ self .seen_links .update (relevant_links )
147
+
83
148
except Exception as e :
84
149
# Fallback approach: Using the LLM to extract links
85
150
self .logger .error (f"Error extracting links: { e } . Falling back to LLM." )
0 commit comments