Skip to content

Commit cf3ab55

Browse files
committed
fix: search link node
1 parent d3e63d9 commit cf3ab55

File tree

1 file changed

+46
-37
lines changed

1 file changed

+46
-37
lines changed

scrapegraphai/nodes/search_link_node.py

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
# Imports from standard library
66
from typing import List, Optional
7+
import re
78
from tqdm import tqdm
89

910
# Imports from Langchain
@@ -20,7 +21,7 @@
2021
class SearchLinkNode(BaseNode):
2122
"""
2223
A node that can filter out the relevant links in the webpage content for the user prompt.
23-
Node expects the aleready scrapped links on the webpage and hence it is expected
24+
Node expects the already scraped links on the webpage and hence it is expected
2425
that this node be used after the FetchNode.
2526
2627
Attributes:
@@ -74,32 +75,6 @@ def execute(self, state: dict) -> dict:
7475
parsed_content_chunks = state[input_keys[1]]
7576
output_parser = JsonOutputParser()
7677

77-
prompt_relevant_links = """
78-
You are a website scraper and you have just scraped the following content from a website.
79-
Content: {content}
80-
81-
You are now tasked with identifying all hyper links within the content that are potentially
82-
relevant to the user task: {user_prompt}
83-
84-
Assume relevance broadly, including any links that might be related or potentially useful
85-
in relation to the task.
86-
87-
Sort it in order of importance, the first one should be the most important one, the last one
88-
the least important
89-
90-
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
91-
whether the content at the link is directly relevant.
92-
93-
Output only a list of relevant links in the format:
94-
[
95-
"link1",
96-
"link2",
97-
"link3",
98-
.
99-
.
100-
.
101-
]
102-
"""
10378
relevant_links = []
10479

10580
for i, chunk in enumerate(
@@ -109,15 +84,49 @@ def execute(self, state: dict) -> dict:
10984
disable=not self.verbose,
11085
)
11186
):
112-
merge_prompt = PromptTemplate(
113-
template=prompt_relevant_links,
114-
input_variables=["content", "user_prompt"],
115-
)
116-
merge_chain = merge_prompt | self.llm_model | output_parser
117-
# merge_chain = merge_prompt | self.llm_model
118-
answer = merge_chain.invoke(
119-
{"content": chunk.page_content, "user_prompt": user_prompt}
120-
)
121-
relevant_links += answer
87+
try:
88+
# Primary approach: Regular expression to extract links
89+
links = re.findall(r'(https?://\S+)', chunk.page_content)
90+
relevant_links += links
91+
except Exception as e:
92+
# Fallback approach: Using the LLM to extract links
93+
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
94+
prompt_relevant_links = """
95+
You are a website scraper and you have just scraped the following content from a website.
96+
Content: {content}
97+
98+
You are now tasked with identifying all hyper links within the content that are potentially
99+
relevant to the user task: {user_prompt}
100+
101+
Assume relevance broadly, including any links that might be related or potentially useful
102+
in relation to the task.
103+
104+
Sort it in order of importance, the first one should be the most important one, the last one
105+
the least important
106+
107+
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
108+
whether the content at the link is directly relevant.
109+
110+
Output only a list of relevant links in the format:
111+
[
112+
"link1",
113+
"link2",
114+
"link3",
115+
.
116+
.
117+
.
118+
]
119+
"""
120+
121+
merge_prompt = PromptTemplate(
122+
template=prompt_relevant_links,
123+
input_variables=["content", "user_prompt"],
124+
)
125+
merge_chain = merge_prompt | self.llm_model | output_parser
126+
answer = merge_chain.invoke(
127+
{"content": chunk.page_content, "user_prompt": user_prompt}
128+
)
129+
relevant_links += answer
130+
122131
state.update({self.output[0]: relevant_links})
123132
return state

0 commit comments

Comments
 (0)