
feat: Implemented filter logic in search_link_node.py and added a dict entry for Llama3.1:8b #561


Merged: 1 commit, Aug 19, 2024
16 changes: 14 additions & 2 deletions examples/local_models/search_link_graph_ollama.py
@@ -9,14 +9,26 @@

 graph_config = {
     "llm": {
-        "model": "ollama/llama3",
+        "model": "ollama/llama3.1:8b",
         "temperature": 0,
         "format": "json",  # Ollama needs the format to be specified explicitly
         # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
     },

     "verbose": True,
-    "headless": False
+    "headless": False,
+    "filter_config": {
+        "diff_domain_filter": True,
+        # "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
+        # "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
+        # "irrelevant_keywords": [
+        #     '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
+        #     'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/',
+        #     '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about',
+        #     '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help',
+        #     '.pdf', '.zip', '/news', '/files', '/downloads'
+        # ]
+    },
 }

 # ************************************************
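For context, this config is consumed roughly as follows: a minimal sketch, assuming the SearchLinkGraph entry point exported from scrapegraphai.graphs and a placeholder source URL. The new "filter_config" key both switches filtering on and overrides the defaults; commented-out keys fall back to default_filters.filter_dict (added below).

from scrapegraphai.graphs import SearchLinkGraph

# Placeholder URL; any page with outbound links works.
graph = SearchLinkGraph(
    source="https://example.com/projects",
    config=graph_config,  # the dict above, including the new "filter_config"
)

result = graph.run()
print(result)  # links that survived the domain/image/language/keyword filters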
4 changes: 3 additions & 1 deletion scrapegraphai/graphs/search_link_graph.py
@@ -72,7 +72,9 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "llm_model": self.llm_model,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "filter_links": self.config.get("filter_links", None),
+                "filter_config": self.config.get("filter_config", None)
             }
         )

13 changes: 13 additions & 0 deletions scrapegraphai/helpers/default_filters.py
@@ -0,0 +1,13 @@
"""
Module for filtering irrelevant links
"""

filter_dict = {
"diff_domain_filter": True,
"img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
"lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
"irrelevant_keywords": [
'/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
'linkedin.com', 'instagram.com', '.js', '.css',
]
}
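Because the node merges user input over these defaults, a partial filter_config keeps every key it does not override. A minimal sketch of the merge performed in SearchLinkNode.__init__ (the override value here is made up):

from scrapegraphai.helpers import default_filters

# Hypothetical partial override: only disable the cross-domain filter.
provided_filter_config = {"diff_domain_filter": False}

# Same merge as in SearchLinkNode.__init__ below.
filter_config = {**default_filters.filter_dict, **provided_filter_config}

print(filter_config["diff_domain_filter"])  # False (override wins)
print(filter_config["img_exts"][0])         # '.jpg' (default preserved)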
74 changes: 36 additions & 38 deletions scrapegraphai/helpers/models_tokens.py
@@ -51,44 +51,42 @@
"gemini-1.5-pro-latest": 128000,
"models/embedding-001": 2048
},
"ollama": {
"grok-1": 8192,
"command-r": 12800,
"codellama": 16000,
"dbrx": 32768,
"deepseek-coder:33b": 16000,
"falcon": 2048,
"llama2": 4096,
"llama3": 8192,
"llama3:70b": 8192,
"llama3.1":128000,
"llama3.1:70b": 128000,
"lama3.1:405b": 128000,
"scrapegraph": 8192,
"llava": 4096,
"mixtral:8x22b-instruct": 65536,
"mistral":8192,
"mistral-openorca": 32000,
"nomic-embed-text": 8192,
"nous-hermes2:34b": 4096,
"orca-mini": 2048,
"phi3:3.8b": 12800,
"qwen:0.5b": 32000,
"qwen:1.8b": 32000,
"qwen:4b": 32000,
"qwen:14b": 32000,
"qwen:32b": 32000,
"qwen:72b": 32000,
"qwen:110b": 32000,
"stablelm-zephyr": 8192,
"wizardlm2:8x22b": 65536,
# embedding models
"shaw/dmeta-embedding-zh-small-q4": 8192,
"shaw/dmeta-embedding-zh-q4": 8192,
"chevalblanc/acge_text_embedding": 8192,
"martcreation/dmeta-embedding-zh": 8192,
"snowflake-arctic-embed": 8192,
"mxbai-embed-large": 512
"ollama": { "command-r": 12800,
"codellama": 16000,
"dbrx": 32768,
"deepseek-coder:33b": 16000,
"falcon": 2048,
"llama2": 4096,
"llama3": 8192,
"llama3:70b": 8192,
"llama3.1":128000,
"llama3.1:8b": 128000,
"llama3.1:70b": 128000,
"lama3.1:405b": 128000,
"scrapegraph": 8192,
"llava": 4096,
"mixtral:8x22b-instruct": 65536,
"mistral-openorca": 32000,
"nomic-embed-text": 8192,
"nous-hermes2:34b": 4096,
"orca-mini": 2048,
"phi3:3.8b": 12800,
"qwen:0.5b": 32000,
"qwen:1.8b": 32000,
"qwen:4b": 32000,
"qwen:14b": 32000,
"qwen:32b": 32000,
"qwen:72b": 32000,
"qwen:110b": 32000,
"stablelm-zephyr": 8192,
"wizardlm2:8x22b": 65536,
# embedding models
"shaw/dmeta-embedding-zh-small-q4": 8192,
"shaw/dmeta-embedding-zh-q4": 8192,
"chevalblanc/acge_text_embedding": 8192,
"martcreation/dmeta-embedding-zh": 8192,
"snowflake-arctic-embed": 8192,
"mxbai-embed-large": 512
},
"oneapi": {
"qwen-turbo": 6000
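The new entry makes the 128k context window resolvable when the example config above names "ollama/llama3.1:8b". A quick check, assuming the top-level dict in models_tokens.py is exported as models_tokens (an assumption; only the nested provider/model layout is visible in this diff):

from scrapegraphai.helpers.models_tokens import models_tokens

# Provider "ollama", model "llama3.1:8b"; this value ends up as the
# chunk_size handed to SearchLinkNode in search_link_graph.py above.
print(models_tokens["ollama"]["llama3.1:8b"])  # 128000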
73 changes: 69 additions & 4 deletions scrapegraphai/nodes/search_link_node.py
@@ -4,12 +4,14 @@
 from typing import List, Optional
 import re
 from tqdm import tqdm
+from urllib.parse import urlparse, parse_qs
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 from ..prompts import TEMPLATE_RELEVANT_LINKS
+from ..helpers import default_filters


 class SearchLinkNode(BaseNode):
@@ -39,10 +41,54 @@ def __init__(
         super().__init__(node_name, "node", input, output, 1, node_config)

         self.llm_model = node_config["llm_model"]
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )

+        # Apply filters if filter_links is True or if filter_config is provided
+        if node_config.get("filter_links", False) or "filter_config" in node_config:
+            # Merge provided filter config with default filter config for partial configuration
+            provided_filter_config = node_config.get("filter_config", {})
+            self.filter_config = {**default_filters.filter_dict, **provided_filter_config}
+            self.filter_links = True
+        else:
+            # Skip filtering if not enabled
+            self.filter_config = None
+            self.filter_links = False
+
+        self.verbose = node_config.get("verbose", False)
+        self.seen_links = set()
+
+    def _is_same_domain(self, url, domain):
+        if not self.filter_links or not self.filter_config.get("diff_domain_filter", True):
+            return True  # Skip the domain filter if not enabled
+        parsed_url = urlparse(url)
+        parsed_domain = urlparse(domain)
+        return parsed_url.netloc == parsed_domain.netloc
+
+    def _is_image_url(self, url):
+        if not self.filter_links:
+            return False  # Skip image filtering if filtering is not enabled
+
+        image_extensions = self.filter_config.get("img_exts", [])
+        return any(url.lower().endswith(ext) for ext in image_extensions)
+
+    def _is_language_url(self, url):
+        if not self.filter_links:
+            return False  # Skip language filtering if filtering is not enabled
+
+        lang_indicators = self.filter_config.get("lang_indicators", [])
+        parsed_url = urlparse(url)
+        query_params = parse_qs(parsed_url.query)
+
+        # Check if the URL path or query string indicates a language-specific version
+        return any(indicator in parsed_url.path.lower() or indicator in query_params for indicator in lang_indicators)
+
+    def _is_potentially_irrelevant(self, url):
+        if not self.filter_links:
+            return False  # Skip irrelevant URL filtering if filtering is not enabled
+
+        irrelevant_keywords = self.filter_config.get("irrelevant_keywords", [])
+        return any(keyword in url.lower() for keyword in irrelevant_keywords)
+

     def execute(self, state: dict) -> dict:
         """
         Filter out relevant links from the webpage that are relavant to prompt. Out of the filtered links, also
@@ -64,6 +110,7 @@ def execute(self, state: dict) -> dict:


         parsed_content_chunks = state.get("doc")
+        source_url = state.get("url") or state.get("local_dir")
         output_parser = JsonOutputParser()

         relevant_links = []
@@ -76,10 +123,28 @@
             )
         ):
             try:
+
+                # Primary approach: Regular expression to extract links
                 links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))

-                relevant_links += links
+                if not self.filter_links:
+                    links = list(set(links))
+
+                    relevant_links += links
+                    self.seen_links.update(relevant_links)
+                else:
+                    filtered_links = [
+                        link for link in links
+                        if self._is_same_domain(link, source_url)
+                        and not self._is_image_url(link)
+                        and not self._is_language_url(link)
+                        and not self._is_potentially_irrelevant(link)
+                        and link not in self.seen_links
+                    ]
+                    filtered_links = list(set(filtered_links))
+                    relevant_links += filtered_links
+                    self.seen_links.update(relevant_links)

             except Exception as e:
                 # Fallback approach: Using the LLM to extract links
                 self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
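Taken together, the four predicates plus seen_links implement a single filtering pass over the regex-extracted links, deduplicated across chunks. A standalone sketch of that pass using the default filter_dict; the URLs are invented for illustration:

from urllib.parse import urlparse, parse_qs

from scrapegraphai.helpers import default_filters

cfg = default_filters.filter_dict
source_url = "https://example.com/docs"  # hypothetical source page

candidates = [
    "https://example.com/docs/install",   # same domain, relevant: kept
    "https://twitter.com/example",        # other domain + irrelevant keyword
    "https://example.com/logo.png",       # image extension
    "https://example.com/fr/docs",        # language-specific variant
]

def keep(url: str) -> bool:
    parsed = urlparse(url)
    same_domain = parsed.netloc == urlparse(source_url).netloc
    is_image = any(url.lower().endswith(ext) for ext in cfg["img_exts"])
    is_lang = any(ind in parsed.path.lower() or ind in parse_qs(parsed.query)
                  for ind in cfg["lang_indicators"])
    is_irrelevant = any(kw in url.lower() for kw in cfg["irrelevant_keywords"])
    return same_domain and not (is_image or is_lang or is_irrelevant)

print([u for u in candidates if keep(u)])
# ['https://example.com/docs/install']

Only the same-domain, non-image, non-language, non-keyword link survives, which matches what the list comprehension in execute() keeps.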