4
4
5
5
# Imports from standard library
6
6
from typing import List , Optional
7
+ import re
7
8
from tqdm import tqdm
8
9
9
10
# Imports from Langchain
20
21
class SearchLinkNode (BaseNode ):
21
22
"""
22
23
A node that can filter out the relevant links in the webpage content for the user prompt.
23
- Node expects the aleready scrapped links on the webpage and hence it is expected
24
+ Node expects the already scraped links on the webpage and hence it is expected
24
25
that this node be used after the FetchNode.
25
26
26
27
Attributes:
@@ -74,32 +75,6 @@ def execute(self, state: dict) -> dict:
74
75
parsed_content_chunks = state [input_keys [1 ]]
75
76
output_parser = JsonOutputParser ()
76
77
77
- prompt_relevant_links = """
78
- You are a website scraper and you have just scraped the following content from a website.
79
- Content: {content}
80
-
81
- You are now tasked with identifying all hyper links within the content that are potentially
82
- relevant to the user task: {user_prompt}
83
-
84
- Assume relevance broadly, including any links that might be related or potentially useful
85
- in relation to the task.
86
-
87
- Sort it in order of importance, the first one should be the most important one, the last one
88
- the least important
89
-
90
- Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
91
- whether the content at the link is directly relevant.
92
-
93
- Output only a list of relevant links in the format:
94
- [
95
- "link1",
96
- "link2",
97
- "link3",
98
- .
99
- .
100
- .
101
- ]
102
- """
103
78
relevant_links = []
104
79
105
80
for i , chunk in enumerate (
@@ -109,15 +84,49 @@ def execute(self, state: dict) -> dict:
109
84
disable = not self .verbose ,
110
85
)
111
86
):
112
- merge_prompt = PromptTemplate (
113
- template = prompt_relevant_links ,
114
- input_variables = ["content" , "user_prompt" ],
115
- )
116
- merge_chain = merge_prompt | self .llm_model | output_parser
117
- # merge_chain = merge_prompt | self.llm_model
118
- answer = merge_chain .invoke (
119
- {"content" : chunk .page_content , "user_prompt" : user_prompt }
120
- )
121
- relevant_links += answer
87
+ try :
88
+ # Primary approach: Regular expression to extract links
89
+ links = re .findall (r'(https?://\S+)' , chunk .page_content )
90
+ relevant_links += links
91
+ except Exception as e :
92
+ # Fallback approach: use the LLM to extract links (reached only if the regex step raises, not when it simply finds no links)
93
+ self .logger .error (f"Error extracting links: { e } . Falling back to LLM." )
94
+ prompt_relevant_links = """
95
+ You are a website scraper and you have just scraped the following content from a website.
96
+ Content: {content}
97
+
98
+ You are now tasked with identifying all hyper links within the content that are potentially
99
+ relevant to the user task: {user_prompt}
100
+
101
+ Assume relevance broadly, including any links that might be related or potentially useful
102
+ in relation to the task.
103
+
104
+ Sort it in order of importance, the first one should be the most important one, the last one
105
+ the least important
106
+
107
+ Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
108
+ whether the content at the link is directly relevant.
109
+
110
+ Output only a list of relevant links in the format:
111
+ [
112
+ "link1",
113
+ "link2",
114
+ "link3",
115
+ .
116
+ .
117
+ .
118
+ ]
119
+ """
120
+
121
+ merge_prompt = PromptTemplate (
122
+ template = prompt_relevant_links ,
123
+ input_variables = ["content" , "user_prompt" ],
124
+ )
125
+ merge_chain = merge_prompt | self .llm_model | output_parser
126
+ answer = merge_chain .invoke (
127
+ {"content" : chunk .page_content , "user_prompt" : user_prompt }
128
+ )
129
+ relevant_links += answer
130
+
122
131
state .update ({self .output [0 ]: relevant_links })
123
132
return state
0 commit comments