ScrapeGraphAI
diff --git a/CHANGELOG.md
Lines changed: 14 additions & 0 deletions b/CHANGELOG.md
Lines changed: 14 additions & 0 deletions
diff --git a/examples/local_models/search_link_graph_ollama.py
Lines changed: 14 additions & 2 deletions b/examples/local_models/search_link_graph_ollama.py
Lines changed: 14 additions & 2 deletions
diff --git a/pyproject.toml
Lines changed: 1 addition & 1 deletion b/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/scrapegraphai/graphs/search_link_graph.py
Lines changed: 3 additions & 1 deletion b/scrapegraphai/graphs/search_link_graph.py
Lines changed: 3 additions & 1 deletion
diff --git a/scrapegraphai/helpers/default_filters.py
Lines changed: 13 additions & 0 deletions b/scrapegraphai/helpers/default_filters.py
Lines changed: 13 additions & 0 deletions
diff --git a/scrapegraphai/helpers/models_tokens.py
Lines changed: 40 additions & 38 deletions b/scrapegraphai/helpers/models_tokens.py
Lines changed: 40 additions & 38 deletions
diff --git a/scrapegraphai/nodes/base_node.py
Lines changed: 2 additions & 2 deletions b/scrapegraphai/nodes/base_node.py
Lines changed: 2 additions & 2 deletions
diff --git a/scrapegraphai/nodes/fetch_node.py
Lines changed: 1 addition & 6 deletions b/scrapegraphai/nodes/fetch_node.py
Lines changed: 1 addition & 6 deletions
diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py
Lines changed: 10 additions & 12 deletions b/scrapegraphai/nodes/generate_answer_csv_node.py
Lines changed: 10 additions & 12 deletions
diff --git a/scrapegraphai/nodes/generate_answer_node.py
Lines changed: 14 additions & 14 deletions b/scrapegraphai/nodes/generate_answer_node.py
Lines changed: 14 additions & 14 deletions
diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py
Lines changed: 10 additions & 11 deletions b/scrapegraphai/nodes/generate_answer_omni_node.py
Lines changed: 10 additions & 11 deletions
@@ -1,3 +1,17 @@
+## [1.14.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.14.0-beta.9...v1.14.0-beta.10) (2024-08-19)
+
+
+### Features
+
+* Implemented filter logic in search_link_node.py ([08e9d9d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/08e9d9d6a09f450a9f512ac2789287819ced9641))
+
+## [1.14.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.14.0-beta.8...v1.14.0-beta.9) (2024-08-17)
+
+
+### Features
+
+* update model tokens dict ([0aca287](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0aca28732b249ffaedf5b665cbfb5b1255c0cc74))
+
 ## [1.14.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.14.0-beta.7...v1.14.0-beta.8) (2024-08-17)
 
 
 
@@ -9,14 +9,26 @@
 
 graph_config = {
     "llm": {
-        "model": "ollama/llama3",
+        "model": "ollama/llama3.1:8b",
         "temperature": 0,
         "format": "json",  # Ollama needs the format to be specified explicitly
         # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
     },
 
     "verbose": True,
-    "headless": False
+    "headless": False,
+    "filter_config": {
+        "diff_domain_filter": True,
+        # "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
+        # "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
+        # "irrelevant_keywords": [
+        #         '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com', 
+        #         'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/', 
+        #         '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about', 
+        #         '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help',
+        #         '.pdf', '.zip', '/news', '/files', '/downloads'
+        #     ]
+    },
 }
 
 # ************************************************
 
@@ -2,7 +2,7 @@
 name = "scrapegraphai"
 
 
-version = "1.14.0b8"
+version = "1.14.0b10"
 
 
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 
@@ -72,7 +72,9 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "llm_model": self.llm_model,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "filter_links": self.config.get("filter_links", None),
+                "filter_config": self.config.get("filter_config", None)
             }
         )
 
 
@@ -0,0 +1,13 @@
+""" 
+Module for filtering irrelevant links
+"""
+
+filter_dict = {
+    "diff_domain_filter": True,
+    "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
+    "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
+    "irrelevant_keywords": [
+            '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com', 
+            'linkedin.com', 'instagram.com', '.js', '.css',
+        ]
+}
@@ -1,3 +1,7 @@
+"""
+List of model tokens
+"""
+
 models_tokens = {
     "openai": {
         "gpt-3.5-turbo-0125": 16385,
@@ -47,44 +51,42 @@
         "gemini-1.5-pro-latest": 128000,
         "models/embedding-001": 2048
     },
-    "ollama": { 
-        "grok-1": 8192,
-        "command-r": 12800, 
-        "codellama": 16000, 
-        "dbrx": 32768, 
-        "deepseek-coder:33b": 16000, 
-        "falcon": 2048, 
-        "llama2": 4096, 
-        "llama3": 8192, 
-        "llama3:70b": 8192,
-        "llama3.1":128000,
-        "llama3.1:70b": 128000,
-        "lama3.1:405b": 128000,
-        "scrapegraph": 8192, 
-        "llava": 4096, 
-        "mixtral:8x22b-instruct": 65536, 
-        "mistral":8192,
-        "mistral-openorca": 32000, 
-        "nomic-embed-text": 8192, 
-        "nous-hermes2:34b": 4096, 
-        "orca-mini": 2048, 
-        "phi3:3.8b": 12800, 
-        "qwen:0.5b": 32000, 
-        "qwen:1.8b": 32000, 
-        "qwen:4b": 32000, 
-        "qwen:14b": 32000, 
-        "qwen:32b": 32000, 
-        "qwen:72b": 32000, 
-        "qwen:110b": 32000, 
-        "stablelm-zephyr": 8192, 
-        "wizardlm2:8x22b": 65536, 
-        # embedding models
-        "shaw/dmeta-embedding-zh-small-q4": 8192,
-        "shaw/dmeta-embedding-zh-q4": 8192,
-        "chevalblanc/acge_text_embedding": 8192,
-        "martcreation/dmeta-embedding-zh": 8192,
-        "snowflake-arctic-embed": 8192, 
-        "mxbai-embed-large": 512 
+    "ollama": { "command-r": 12800, 
+               "codellama": 16000, 
+               "dbrx": 32768, 
+               "deepseek-coder:33b": 16000, 
+               "falcon": 2048, 
+               "llama2": 4096, 
+               "llama3": 8192, 
+               "llama3:70b": 8192,
+               "llama3.1":128000,
+               "llama3.1:8b": 128000,
+               "llama3.1:70b": 128000,
+               "lama3.1:405b": 128000,
+               "scrapegraph": 8192, 
+               "llava": 4096, 
+               "mixtral:8x22b-instruct": 65536, 
+               "mistral-openorca": 32000, 
+               "nomic-embed-text": 8192, 
+               "nous-hermes2:34b": 4096, 
+               "orca-mini": 2048, 
+               "phi3:3.8b": 12800, 
+               "qwen:0.5b": 32000, 
+               "qwen:1.8b": 32000, 
+               "qwen:4b": 32000, 
+               "qwen:14b": 32000, 
+               "qwen:32b": 32000, 
+               "qwen:72b": 32000, 
+               "qwen:110b": 32000, 
+               "stablelm-zephyr": 8192, 
+               "wizardlm2:8x22b": 65536, 
+               # embedding models
+               "shaw/dmeta-embedding-zh-small-q4": 8192,
+               "shaw/dmeta-embedding-zh-q4": 8192,
+               "chevalblanc/acge_text_embedding": 8192,
+               "martcreation/dmeta-embedding-zh": 8192,
+               "snowflake-arctic-embed": 8192, 
+               "mxbai-embed-large": 512 
     },
     "oneapi": {
         "qwen-turbo": 6000 
 
@@ -5,13 +5,13 @@
 import re
 from abc import ABC, abstractmethod
 from typing import List, Optional
-
 from ..utils import get_logger
 
 
 class BaseNode(ABC):
     """
-    An abstract base class for nodes in a graph-based workflow, designed to perform specific actions when executed.
+    An abstract base class for nodes in a graph-based workflow, 
+    designed to perform specific actions when executed.
 
     Attributes:
         node_name (str): The unique identifier name for the node.
 
@@ -1,7 +1,6 @@
 """"
 FetchNode Module
 """
-
 import json
 from typing import List, Optional
 from langchain_openai import ChatOpenAI, AzureChatOpenAI
@@ -16,10 +15,6 @@
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 
-
-""""
-FetchNode Module
-"""
 class FetchNode(BaseNode):
     """
     A node responsible for fetching the HTML content of a specified URL and updating
@@ -218,7 +213,7 @@ def handle_local_source(self, state, source):
         self.logger.info(f"--- (Fetching HTML from: {source}) ---")
         if not source.strip():
             raise ValueError("No HTML body content found in the local source.")
-  
+
         parsed_content = source
 
         if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator:
 
@@ -1,5 +1,4 @@
 """
-gg
 Module for generating the answer node
 """
 
@@ -10,8 +9,7 @@
 from tqdm import tqdm
 from ..utils.logging import get_logger
 from .base_node import BaseNode
-from ..prompts.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv
-
+from ..prompts.generate_answer_node_csv_prompts import TEMPLATE_CHUKS_CSV, TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV
 
 class GenerateAnswerCSVNode(BaseNode):
     """
@@ -97,22 +95,22 @@ def execute(self, state):
         else:
             output_parser = JsonOutputParser()
 
-        template_no_chunks_csv_prompt = template_no_chunks_csv
-        template_chunks_csv_prompt = template_chunks_csv
-        template_merge_csv_prompt  = template_merge_csv
+        TEMPLATE_NO_CHUKS_CSV_prompt = TEMPLATE_NO_CHUKS_CSV
+        TEMPLATE_CHUKS_CSV_prompt = TEMPLATE_CHUKS_CSV
+        TEMPLATE_MERGE_CSV_prompt  = TEMPLATE_MERGE_CSV
 
         if self.additional_info is not None:
-            template_no_chunks_csv_prompt = self.additional_info + template_no_chunks_csv
-            template_chunks_csv_prompt = self.additional_info + template_chunks_csv
-            template_merge_csv_prompt = self.additional_info + template_merge_csv
+            TEMPLATE_NO_CHUKS_CSV_prompt = self.additional_info + TEMPLATE_NO_CHUKS_CSV
+            TEMPLATE_CHUKS_CSV_prompt = self.additional_info + TEMPLATE_CHUKS_CSV
+            TEMPLATE_MERGE_CSV_prompt = self.additional_info + TEMPLATE_MERGE_CSV
 
         format_instructions = output_parser.get_format_instructions()
 
         chains_dict = {}
 
         if len(doc) == 1:
             prompt = PromptTemplate(
-                template=template_no_chunks_csv_prompt,
+                template=TEMPLATE_NO_CHUKS_CSV_prompt,
                 input_variables=["question"],
                 partial_variables={
                     "context": doc,
@@ -129,7 +127,7 @@ def execute(self, state):
             tqdm(doc, desc="Processing chunks", disable=not self.verbose)
         ):
             prompt = PromptTemplate(
-                    template=template_chunks_csv_prompt,
+                    template=TEMPLATE_CHUKS_CSV_prompt,
                     input_variables=["question"],
                     partial_variables={
                         "context": chunk,
@@ -146,7 +144,7 @@ def execute(self, state):
         batch_results =  async_runner.invoke({"question": user_prompt})
 
         merge_prompt = PromptTemplate(
-                template = template_merge_csv_prompt,
+                template = TEMPLATE_MERGE_CSV_prompt,
                 input_variables=["context", "question"],
                 partial_variables={"format_instructions": format_instructions},
             )
 
@@ -10,7 +10,7 @@
 from tqdm import tqdm
 from ..utils.logging import get_logger
 from .base_node import BaseNode
-from ..prompts import template_chunks, template_no_chunks, template_merge, template_chunks_md, template_no_chunks_md, template_merge_md
+from ..prompts import TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD
 
 class GenerateAnswerNode(BaseNode):
     """
@@ -98,23 +98,23 @@ def execute(self, state: dict) -> dict:
 
         format_instructions = output_parser.get_format_instructions()
 
-        template_no_chunks_prompt = template_no_chunks
-        template_chunks_prompt = template_chunks
-        template_merge_prompt = template_merge
-
         if  isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper:
-            template_no_chunks_prompt = template_no_chunks_md
-            template_chunks_prompt = template_chunks_md
-            template_merge_prompt = template_merge_md
+            template_no_chunks_prompt  = TEMPLATE_NO_CHUNKS_MD
+            template_chunks_prompt  = TEMPLATE_CHUNKS_MD
+            template_merge_prompt  = TEMPLATE_MERGE_MD
+        else:
+            template_no_chunks_prompt  = TEMPLATE_NO_CHUNKS
+            template_chunks_prompt  = TEMPLATE_CHUNKS
+            template_merge_prompt  = TEMPLATE_MERGE
 
         if self.additional_info is not None:
-            template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt
-            template_chunks_prompt = self.additional_info + template_chunks_prompt
-            template_merge_prompt = self.additional_info + template_merge_prompt
+            template_no_chunks_prompt  = self.additional_info + template_no_chunks_prompt
+            template_chunks_prompt  = self.additional_info + template_chunks_prompt
+            template_merge_prompt  = self.additional_info + template_merge_prompt 
 
         if len(doc) == 1:
             prompt = PromptTemplate(
-                template=template_no_chunks_prompt,
+                template=template_no_chunks_prompt ,
                 input_variables=["question"],
                 partial_variables={"context": doc,
                                     "format_instructions": format_instructions})
@@ -128,7 +128,7 @@ def execute(self, state: dict) -> dict:
         for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)):
 
             prompt = PromptTemplate(
-                template=template_chunks,
+                template=TEMPLATE_CHUNKS,
                 input_variables=["question"],
                 partial_variables={"context": chunk,
                                 "chunk_id": i + 1,
@@ -141,7 +141,7 @@ def execute(self, state: dict) -> dict:
         batch_results =  async_runner.invoke({"question": user_prompt})
 
         merge_prompt = PromptTemplate(
-                template = template_merge_prompt,
+                template = template_merge_prompt ,
                 input_variables=["context", "question"],
                 partial_variables={"format_instructions": format_instructions},
             )
 
@@ -8,8 +8,7 @@
 from tqdm import tqdm
 from langchain_community.chat_models import ChatOllama
 from .base_node import BaseNode
-from ..prompts.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni
-
+from ..prompts.generate_answer_node_omni_prompts import TEMPLATE_NO_CHUNKS_OMNI, TEMPLATE_CHUNKS_OMNI, TEMPLATE_MERGE_OMNI
 
 class GenerateAnswerOmniNode(BaseNode):
     """
@@ -82,22 +81,22 @@ def execute(self, state: dict) -> dict:
             output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
         else:
             output_parser = JsonOutputParser()
-        template_no_chunk_omni_prompt = template_no_chunk_omni
-        template_chunks_omni_prompt = template_chunks_omni
-        template_merge_omni_prompt= template_merge_omni
+        TEMPLATE_NO_CHUNKS_OMNI_prompt = TEMPLATE_NO_CHUNKS_OMNI
+        TEMPLATE_CHUNKS_OMNI_prompt = TEMPLATE_CHUNKS_OMNI
+        TEMPLATE_MERGE_OMNI_prompt= TEMPLATE_MERGE_OMNI
 
         if self.additional_info is not None:
-            template_no_chunk_omni_prompt = self.additional_info + template_no_chunk_omni_prompt
-            template_chunks_omni_prompt = self.additional_info + template_chunks_omni_prompt
-            template_merge_omni_prompt = self.additional_info + template_merge_omni_prompt
+            TEMPLATE_NO_CHUNKS_OMNI_prompt = self.additional_info + TEMPLATE_NO_CHUNKS_OMNI_prompt
+            TEMPLATE_CHUNKS_OMNI_prompt = self.additional_info + TEMPLATE_CHUNKS_OMNI_prompt
+            TEMPLATE_MERGE_OMNI_prompt = self.additional_info + TEMPLATE_MERGE_OMNI_prompt
 
         format_instructions = output_parser.get_format_instructions()
 
 
         chains_dict = {}
         if len(doc) == 1:
             prompt = PromptTemplate(
-                template=template_no_chunk_omni_prompt,
+                template=TEMPLATE_NO_CHUNKS_OMNI_prompt,
                 input_variables=["question"],
                 partial_variables={
                     "context": doc,
@@ -116,7 +115,7 @@ def execute(self, state: dict) -> dict:
             tqdm(doc, desc="Processing chunks", disable=not self.verbose)
         ):
             prompt = PromptTemplate(
-                    template=template_chunks_omni_prompt,
+                    template=TEMPLATE_CHUNKS_OMNI_prompt,
                     input_variables=["question"],
                     partial_variables={
                         "context": chunk,
@@ -134,7 +133,7 @@ def execute(self, state: dict) -> dict:
         batch_results =  async_runner.invoke({"question": user_prompt})
 
         merge_prompt = PromptTemplate(
-                template = template_merge_omni_prompt,
+                template = TEMPLATE_MERGE_OMNI_prompt,
                 input_variables=["context", "question"],
                 partial_variables={"format_instructions": format_instructions},
             )
Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,9 @@ def _create_graph(self) -> BaseGraph:`
`72`	`72`	`output=["parsed_doc"],`
`73`	`73`	`node_config={`
`74`	`74`	`"llm_model": self.llm_model,`
`75`		`- "chunk_size": self.model_token`
	`75`	`+ "chunk_size": self.model_token,`
	`76`	`+ "filter_links": self.config.get("filter_links", None),`
	`77`	`+ "filter_config": self.config.get("filter_config", None)`
`76`	`78`	`}`
`77`	`79`	`)`
`78`	`80`