
Commit 08e9d9d

feat: Implemented filter logic in search_link_node.py
feat: Added dict entry for Llama3.1:8b
1 parent 8d6c0b7 commit 08e9d9d

File tree

5 files changed: 135 additions & 45 deletions


examples/local_models/search_link_graph_ollama.py

Lines changed: 14 additions & 2 deletions
@@ -9,14 +9,26 @@
 
 graph_config = {
     "llm": {
-        "model": "ollama/llama3",
+        "model": "ollama/llama3.1:8b",
         "temperature": 0,
         "format": "json", # Ollama needs the format to be specified explicitly
         # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
 
     "verbose": True,
-    "headless": False
+    "headless": False,
+    "filter_config": {
+        "diff_domain_filter": True,
+        # "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
+        # "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
+        # "irrelevant_keywords": [
+        #     '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
+        #     'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/',
+        #     '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about',
+        #     '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help',
+        #     '.pdf', '.zip', '/news', '/files', '/downloads'
+        # ]
+    },
 }
 
 # ************************************************
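
For orientation, a minimal usage sketch of how this example config is consumed; the SearchLinkGraph call pattern is an assumption (the rest of the example file is not shown in this diff) and the source URL is a placeholder:

from scrapegraphai.graphs import SearchLinkGraph

# Assumed call pattern; "https://example.com" is a placeholder source
search_link_graph = SearchLinkGraph(
    source="https://example.com",
    config=graph_config,
)
result = search_link_graph.run()
print(result)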

scrapegraphai/graphs/search_link_graph.py

Lines changed: 3 additions & 1 deletion
@@ -72,7 +72,9 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "llm_model": self.llm_model,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "filter_links": self.config.get("filter_links", None),
+                "filter_config": self.config.get("filter_config", None)
             }
         )
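
Both new keys are read from the top-level graph config, so filtering can be enabled without touching the node itself; a minimal sketch using the key names wired through in this diff:

graph_config = {
    "llm": {"model": "ollama/llama3.1:8b", "temperature": 0, "format": "json"},
    "filter_links": True,       # enable filtering with the built-in defaults
    # "filter_config": {...},   # or override individual filter keys (see default_filters below)
}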

scrapegraphai/helpers/default_filters.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+"""
+Module for filtering irrelevant links
+"""
+
+filter_dict = {
+    "diff_domain_filter": True,
+    "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
+    "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
+    "irrelevant_keywords": [
+        '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
+        'linkedin.com', 'instagram.com', '.js', '.css',
+    ]
+}
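
SearchLinkNode merges a user-supplied filter_config over this dict (see the node diff below), so a partial override keeps the remaining defaults; a minimal sketch of that merge:

from scrapegraphai.helpers import default_filters

user_filter_config = {"diff_domain_filter": False}  # hypothetical partial override
merged = {**default_filters.filter_dict, **user_filter_config}

assert merged["diff_domain_filter"] is False
assert merged["img_exts"] == default_filters.filter_dict["img_exts"]  # defaults preserved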

scrapegraphai/helpers/models_tokens.py

Lines changed: 36 additions & 38 deletions
@@ -51,44 +51,42 @@
         "gemini-1.5-pro-latest": 128000,
         "models/embedding-001": 2048
     },
-    "ollama": {
-        "grok-1": 8192,
-        "command-r": 12800,
-        "codellama": 16000,
-        "dbrx": 32768,
-        "deepseek-coder:33b": 16000,
-        "falcon": 2048,
-        "llama2": 4096,
-        "llama3": 8192,
-        "llama3:70b": 8192,
-        "llama3.1":128000,
-        "llama3.1:70b": 128000,
-        "lama3.1:405b": 128000,
-        "scrapegraph": 8192,
-        "llava": 4096,
-        "mixtral:8x22b-instruct": 65536,
-        "mistral":8192,
-        "mistral-openorca": 32000,
-        "nomic-embed-text": 8192,
-        "nous-hermes2:34b": 4096,
-        "orca-mini": 2048,
-        "phi3:3.8b": 12800,
-        "qwen:0.5b": 32000,
-        "qwen:1.8b": 32000,
-        "qwen:4b": 32000,
-        "qwen:14b": 32000,
-        "qwen:32b": 32000,
-        "qwen:72b": 32000,
-        "qwen:110b": 32000,
-        "stablelm-zephyr": 8192,
-        "wizardlm2:8x22b": 65536,
-        # embedding models
-        "shaw/dmeta-embedding-zh-small-q4": 8192,
-        "shaw/dmeta-embedding-zh-q4": 8192,
-        "chevalblanc/acge_text_embedding": 8192,
-        "martcreation/dmeta-embedding-zh": 8192,
-        "snowflake-arctic-embed": 8192,
-        "mxbai-embed-large": 512
+    "ollama": { "command-r": 12800,
+        "codellama": 16000,
+        "dbrx": 32768,
+        "deepseek-coder:33b": 16000,
+        "falcon": 2048,
+        "llama2": 4096,
+        "llama3": 8192,
+        "llama3:70b": 8192,
+        "llama3.1":128000,
+        "llama3.1:8b": 128000,
+        "llama3.1:70b": 128000,
+        "lama3.1:405b": 128000,
+        "scrapegraph": 8192,
+        "llava": 4096,
+        "mixtral:8x22b-instruct": 65536,
+        "mistral-openorca": 32000,
+        "nomic-embed-text": 8192,
+        "nous-hermes2:34b": 4096,
+        "orca-mini": 2048,
+        "phi3:3.8b": 12800,
+        "qwen:0.5b": 32000,
+        "qwen:1.8b": 32000,
+        "qwen:4b": 32000,
+        "qwen:14b": 32000,
+        "qwen:32b": 32000,
+        "qwen:72b": 32000,
+        "qwen:110b": 32000,
+        "stablelm-zephyr": 8192,
+        "wizardlm2:8x22b": 65536,
+        # embedding models
+        "shaw/dmeta-embedding-zh-small-q4": 8192,
+        "shaw/dmeta-embedding-zh-q4": 8192,
+        "chevalblanc/acge_text_embedding": 8192,
+        "martcreation/dmeta-embedding-zh": 8192,
+        "snowflake-arctic-embed": 8192,
+        "mxbai-embed-large": 512
     },
     "oneapi": {
         "qwen-turbo": 6000

scrapegraphai/nodes/search_link_node.py

Lines changed: 69 additions & 4 deletions
@@ -4,12 +4,14 @@
 from typing import List, Optional
 import re
 from tqdm import tqdm
+from urllib.parse import urlparse, parse_qs
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 from ..prompts import TEMPLATE_RELEVANT_LINKS
+from ..helpers import default_filters
 
 
 class SearchLinkNode(BaseNode):
@@ -39,10 +41,54 @@ def __init__(
         super().__init__(node_name, "node", input, output, 1, node_config)
 
         self.llm_model = node_config["llm_model"]
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )
 
+        # Apply filters if filter_links is True or if filter_config is provided
+        if node_config.get("filter_links", False) or "filter_config" in node_config:
+            # Merge provided filter config with default filter config for partial configuration
+            provided_filter_config = node_config.get("filter_config", {})
+            self.filter_config = {**default_filters.filter_dict, **provided_filter_config}
+            self.filter_links = True
+        else:
+            # Skip filtering if not enabled
+            self.filter_config = None
+            self.filter_links = False
+
+        self.verbose = node_config.get("verbose", False)
+        self.seen_links = set()
+
+    def _is_same_domain(self, url, domain):
+        if not self.filter_links or not self.filter_config.get("diff_domain_filter", True):
+            return True  # Skip the domain filter if not enabled
+        parsed_url = urlparse(url)
+        parsed_domain = urlparse(domain)
+        return parsed_url.netloc == parsed_domain.netloc
+
+    def _is_image_url(self, url):
+        if not self.filter_links:
+            return False  # Skip image filtering if filtering is not enabled
+
+        image_extensions = self.filter_config.get("img_exts", [])
+        return any(url.lower().endswith(ext) for ext in image_extensions)
+
+    def _is_language_url(self, url):
+        if not self.filter_links:
+            return False  # Skip language filtering if filtering is not enabled
+
+        lang_indicators = self.filter_config.get("lang_indicators", [])
+        parsed_url = urlparse(url)
+        query_params = parse_qs(parsed_url.query)
+
+        # Check if the URL path or query string indicates a language-specific version
+        return any(indicator in parsed_url.path.lower() or indicator in query_params for indicator in lang_indicators)
+
+    def _is_potentially_irrelevant(self, url):
+        if not self.filter_links:
+            return False  # Skip irrelevant URL filtering if filtering is not enabled
+
+        irrelevant_keywords = self.filter_config.get("irrelevant_keywords", [])
+        return any(keyword in url.lower() for keyword in irrelevant_keywords)
+
+
     def execute(self, state: dict) -> dict:
         """
         Filter out relevant links from the webpage that are relavant to prompt. Out of the filtered links, also
@@ -64,6 +110,7 @@ def execute(self, state: dict) -> dict:
 
 
         parsed_content_chunks = state.get("doc")
+        source_url = state.get("url") or state.get("local_dir")
        output_parser = JsonOutputParser()
 
        relevant_links = []
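
To illustrate what the language check added above matches, here is the same urlparse/parse_qs logic as a standalone sketch with hypothetical URLs:

from urllib.parse import urlparse, parse_qs

lang_indicators = ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it']

def is_language_url(url):
    parsed = urlparse(url)
    query_params = parse_qs(parsed.query)
    # An indicator matches if it appears in the lowercased path
    # or is a key of the parsed query string
    return any(ind in parsed.path.lower() or ind in query_params
               for ind in lang_indicators)

print(is_language_url("https://example.com/fr/articles"))  # True: '/fr' is in the path
print(is_language_url("https://example.com/articles"))     # False: no indicator present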
@@ -76,10 +123,28 @@ def execute(self, state: dict) -> dict:
             )
         ):
             try:
+
                 # Primary approach: Regular expression to extract links
                 links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
 
-                relevant_links += links
+                if not self.filter_links:
+                    links = list(set(links))
+
+                    relevant_links += links
+                    self.seen_links.update(relevant_links)
+                else:
+                    filtered_links = [
+                        link for link in links
+                        if self._is_same_domain(link, source_url)
+                        and not self._is_image_url(link)
+                        and not self._is_language_url(link)
+                        and not self._is_potentially_irrelevant(link)
+                        and link not in self.seen_links
+                    ]
+                    filtered_links = list(set(filtered_links))
+                    relevant_links += filtered_links
+                    self.seen_links.update(relevant_links)
+
             except Exception as e:
                 # Fallback approach: Using the LLM to extract links
                 self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
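
Likewise, the diff_domain_filter reduces to a netloc comparison between each extracted link and the source page; a standalone sketch of that check:

from urllib.parse import urlparse

def is_same_domain(url, domain):
    # Compare the network locations (host[:port]) of the two URLs
    return urlparse(url).netloc == urlparse(domain).netloc

print(is_same_domain("https://example.com/blog", "https://example.com"))        # True
print(is_same_domain("https://cdn.example.com/img.png", "https://example.com")) # False: subdomain differs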
