Skip to content

Commit b48ee82

Browse files
committed
Merge branch 'pre/beta' into support_structured_output_shema_openai
2 parents 683bf57 + 6a08cc8 commit b48ee82

39 files changed

+378
-256
lines changed

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
## [1.14.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.14.0-beta.9...v1.14.0-beta.10) (2024-08-19)
2+
3+
4+
### Features
5+
6+
* Implemented filter logic in search_link_node.py ([08e9d9d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/08e9d9d6a09f450a9f512ac2789287819ced9641))
7+
8+
## [1.14.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.14.0-beta.8...v1.14.0-beta.9) (2024-08-17)
9+
10+
11+
### Features
12+
13+
* update model tokens dict ([0aca287](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0aca28732b249ffaedf5b665cbfb5b1255c0cc74))
14+
115
## [1.14.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.14.0-beta.7...v1.14.0-beta.8) (2024-08-17)
216

317

examples/local_models/search_link_graph_ollama.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,26 @@
99

1010
graph_config = {
1111
"llm": {
12-
"model": "ollama/llama3",
12+
"model": "ollama/llama3.1:8b",
1313
"temperature": 0,
1414
"format": "json", # Ollama needs the format to be specified explicitly
1515
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
1616
},
1717

1818
"verbose": True,
19-
"headless": False
19+
"headless": False,
20+
"filter_config": {
21+
"diff_domain_filter": True,
22+
# "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
23+
# "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
24+
# "irrelevant_keywords": [
25+
# '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
26+
# 'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/',
27+
# '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about',
28+
# '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help',
29+
# '.pdf', '.zip', '/news', '/files', '/downloads'
30+
# ]
31+
},
2032
}
2133

2234
# ************************************************

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
name = "scrapegraphai"
33

44

5-
version = "1.14.0b8"
5+
version = "1.14.0b10"
66

77

88
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."

scrapegraphai/graphs/search_link_graph.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,9 @@ def _create_graph(self) -> BaseGraph:
7272
output=["parsed_doc"],
7373
node_config={
7474
"llm_model": self.llm_model,
75-
"chunk_size": self.model_token
75+
"chunk_size": self.model_token,
76+
"filter_links": self.config.get("filter_links", None),
77+
"filter_config": self.config.get("filter_config", None)
7678
}
7779
)
7880

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
"""
2+
Module for filtering irrelevant links
3+
"""
4+
5+
filter_dict = {
6+
"diff_domain_filter": True,
7+
"img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
8+
"lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
9+
"irrelevant_keywords": [
10+
'/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
11+
'linkedin.com', 'instagram.com', '.js', '.css',
12+
]
13+
}

scrapegraphai/helpers/models_tokens.py

Lines changed: 40 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""
2+
List of model tokens
3+
"""
4+
15
models_tokens = {
26
"openai": {
37
"gpt-3.5-turbo-0125": 16385,
@@ -47,44 +51,42 @@
4751
"gemini-1.5-pro-latest": 128000,
4852
"models/embedding-001": 2048
4953
},
50-
"ollama": {
51-
"grok-1": 8192,
52-
"command-r": 12800,
53-
"codellama": 16000,
54-
"dbrx": 32768,
55-
"deepseek-coder:33b": 16000,
56-
"falcon": 2048,
57-
"llama2": 4096,
58-
"llama3": 8192,
59-
"llama3:70b": 8192,
60-
"llama3.1":128000,
61-
"llama3.1:70b": 128000,
62-
"lama3.1:405b": 128000,
63-
"scrapegraph": 8192,
64-
"llava": 4096,
65-
"mixtral:8x22b-instruct": 65536,
66-
"mistral":8192,
67-
"mistral-openorca": 32000,
68-
"nomic-embed-text": 8192,
69-
"nous-hermes2:34b": 4096,
70-
"orca-mini": 2048,
71-
"phi3:3.8b": 12800,
72-
"qwen:0.5b": 32000,
73-
"qwen:1.8b": 32000,
74-
"qwen:4b": 32000,
75-
"qwen:14b": 32000,
76-
"qwen:32b": 32000,
77-
"qwen:72b": 32000,
78-
"qwen:110b": 32000,
79-
"stablelm-zephyr": 8192,
80-
"wizardlm2:8x22b": 65536,
81-
# embedding models
82-
"shaw/dmeta-embedding-zh-small-q4": 8192,
83-
"shaw/dmeta-embedding-zh-q4": 8192,
84-
"chevalblanc/acge_text_embedding": 8192,
85-
"martcreation/dmeta-embedding-zh": 8192,
86-
"snowflake-arctic-embed": 8192,
87-
"mxbai-embed-large": 512
54+
"ollama": { "command-r": 12800,
55+
"codellama": 16000,
56+
"dbrx": 32768,
57+
"deepseek-coder:33b": 16000,
58+
"falcon": 2048,
59+
"llama2": 4096,
60+
"llama3": 8192,
61+
"llama3:70b": 8192,
62+
"llama3.1":128000,
63+
"llama3.1:8b": 128000,
64+
"llama3.1:70b": 128000,
65+
"lama3.1:405b": 128000,
66+
"scrapegraph": 8192,
67+
"llava": 4096,
68+
"mixtral:8x22b-instruct": 65536,
69+
"mistral-openorca": 32000,
70+
"nomic-embed-text": 8192,
71+
"nous-hermes2:34b": 4096,
72+
"orca-mini": 2048,
73+
"phi3:3.8b": 12800,
74+
"qwen:0.5b": 32000,
75+
"qwen:1.8b": 32000,
76+
"qwen:4b": 32000,
77+
"qwen:14b": 32000,
78+
"qwen:32b": 32000,
79+
"qwen:72b": 32000,
80+
"qwen:110b": 32000,
81+
"stablelm-zephyr": 8192,
82+
"wizardlm2:8x22b": 65536,
83+
# embedding models
84+
"shaw/dmeta-embedding-zh-small-q4": 8192,
85+
"shaw/dmeta-embedding-zh-q4": 8192,
86+
"chevalblanc/acge_text_embedding": 8192,
87+
"martcreation/dmeta-embedding-zh": 8192,
88+
"snowflake-arctic-embed": 8192,
89+
"mxbai-embed-large": 512
8890
},
8991
"oneapi": {
9092
"qwen-turbo": 6000

scrapegraphai/nodes/base_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
import re
66
from abc import ABC, abstractmethod
77
from typing import List, Optional
8-
98
from ..utils import get_logger
109

1110

1211
class BaseNode(ABC):
1312
"""
14-
An abstract base class for nodes in a graph-based workflow, designed to perform specific actions when executed.
13+
An abstract base class for nodes in a graph-based workflow,
14+
designed to perform specific actions when executed.
1515
1616
Attributes:
1717
node_name (str): The unique identifier name for the node.

scrapegraphai/nodes/fetch_node.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
""""
22
FetchNode Module
33
"""
4-
54
import json
65
from typing import List, Optional
76
from langchain_openai import ChatOpenAI, AzureChatOpenAI
@@ -16,10 +15,6 @@
1615
from ..utils.logging import get_logger
1716
from .base_node import BaseNode
1817

19-
20-
""""
21-
FetchNode Module
22-
"""
2318
class FetchNode(BaseNode):
2419
"""
2520
A node responsible for fetching the HTML content of a specified URL and updating
@@ -218,7 +213,7 @@ def handle_local_source(self, state, source):
218213
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
219214
if not source.strip():
220215
raise ValueError("No HTML body content found in the local source.")
221-
216+
222217
parsed_content = source
223218

224219
if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator:

scrapegraphai/nodes/generate_answer_csv_node.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
"""
2-
gg
32
Module for generating the answer node
43
"""
54

@@ -10,8 +9,7 @@
109
from tqdm import tqdm
1110
from ..utils.logging import get_logger
1211
from .base_node import BaseNode
13-
from ..prompts.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv
14-
12+
from ..prompts.generate_answer_node_csv_prompts import TEMPLATE_CHUKS_CSV, TEMPLATE_NO_CHUKS_CSV, TEMPLATE_MERGE_CSV
1513

1614
class GenerateAnswerCSVNode(BaseNode):
1715
"""
@@ -97,22 +95,22 @@ def execute(self, state):
9795
else:
9896
output_parser = JsonOutputParser()
9997

100-
template_no_chunks_csv_prompt = template_no_chunks_csv
101-
template_chunks_csv_prompt = template_chunks_csv
102-
template_merge_csv_prompt = template_merge_csv
98+
TEMPLATE_NO_CHUKS_CSV_prompt = TEMPLATE_NO_CHUKS_CSV
99+
TEMPLATE_CHUKS_CSV_prompt = TEMPLATE_CHUKS_CSV
100+
TEMPLATE_MERGE_CSV_prompt = TEMPLATE_MERGE_CSV
103101

104102
if self.additional_info is not None:
105-
template_no_chunks_csv_prompt = self.additional_info + template_no_chunks_csv
106-
template_chunks_csv_prompt = self.additional_info + template_chunks_csv
107-
template_merge_csv_prompt = self.additional_info + template_merge_csv
103+
TEMPLATE_NO_CHUKS_CSV_prompt = self.additional_info + TEMPLATE_NO_CHUKS_CSV
104+
TEMPLATE_CHUKS_CSV_prompt = self.additional_info + TEMPLATE_CHUKS_CSV
105+
TEMPLATE_MERGE_CSV_prompt = self.additional_info + TEMPLATE_MERGE_CSV
108106

109107
format_instructions = output_parser.get_format_instructions()
110108

111109
chains_dict = {}
112110

113111
if len(doc) == 1:
114112
prompt = PromptTemplate(
115-
template=template_no_chunks_csv_prompt,
113+
template=TEMPLATE_NO_CHUKS_CSV_prompt,
116114
input_variables=["question"],
117115
partial_variables={
118116
"context": doc,
@@ -129,7 +127,7 @@ def execute(self, state):
129127
tqdm(doc, desc="Processing chunks", disable=not self.verbose)
130128
):
131129
prompt = PromptTemplate(
132-
template=template_chunks_csv_prompt,
130+
template=TEMPLATE_CHUKS_CSV_prompt,
133131
input_variables=["question"],
134132
partial_variables={
135133
"context": chunk,
@@ -146,7 +144,7 @@ def execute(self, state):
146144
batch_results = async_runner.invoke({"question": user_prompt})
147145

148146
merge_prompt = PromptTemplate(
149-
template = template_merge_csv_prompt,
147+
template = TEMPLATE_MERGE_CSV_prompt,
150148
input_variables=["context", "question"],
151149
partial_variables={"format_instructions": format_instructions},
152150
)

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from tqdm import tqdm
1111
from ..utils.logging import get_logger
1212
from .base_node import BaseNode
13-
from ..prompts import template_chunks, template_no_chunks, template_merge, template_chunks_md, template_no_chunks_md, template_merge_md
13+
from ..prompts import TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD
1414

1515
class GenerateAnswerNode(BaseNode):
1616
"""
@@ -98,23 +98,23 @@ def execute(self, state: dict) -> dict:
9898

9999
format_instructions = output_parser.get_format_instructions()
100100

101-
template_no_chunks_prompt = template_no_chunks
102-
template_chunks_prompt = template_chunks
103-
template_merge_prompt = template_merge
104-
105101
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper:
106-
template_no_chunks_prompt = template_no_chunks_md
107-
template_chunks_prompt = template_chunks_md
108-
template_merge_prompt = template_merge_md
102+
template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD
103+
template_chunks_prompt = TEMPLATE_CHUNKS_MD
104+
template_merge_prompt = TEMPLATE_MERGE_MD
105+
else:
106+
template_no_chunks_prompt = TEMPLATE_NO_CHUNKS
107+
template_chunks_prompt = TEMPLATE_CHUNKS
108+
template_merge_prompt = TEMPLATE_MERGE
109109

110110
if self.additional_info is not None:
111-
template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt
112-
template_chunks_prompt = self.additional_info + template_chunks_prompt
113-
template_merge_prompt = self.additional_info + template_merge_prompt
111+
template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt
112+
template_chunks_prompt = self.additional_info + template_chunks_prompt
113+
template_merge_prompt = self.additional_info + template_merge_prompt
114114

115115
if len(doc) == 1:
116116
prompt = PromptTemplate(
117-
template=template_no_chunks_prompt,
117+
template=template_no_chunks_prompt ,
118118
input_variables=["question"],
119119
partial_variables={"context": doc,
120120
"format_instructions": format_instructions})
@@ -128,7 +128,7 @@ def execute(self, state: dict) -> dict:
128128
for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)):
129129

130130
prompt = PromptTemplate(
131-
template=template_chunks,
131+
template=TEMPLATE_CHUNKS,
132132
input_variables=["question"],
133133
partial_variables={"context": chunk,
134134
"chunk_id": i + 1,
@@ -141,7 +141,7 @@ def execute(self, state: dict) -> dict:
141141
batch_results = async_runner.invoke({"question": user_prompt})
142142

143143
merge_prompt = PromptTemplate(
144-
template = template_merge_prompt,
144+
template = template_merge_prompt ,
145145
input_variables=["context", "question"],
146146
partial_variables={"format_instructions": format_instructions},
147147
)

scrapegraphai/nodes/generate_answer_omni_node.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@
88
from tqdm import tqdm
99
from langchain_community.chat_models import ChatOllama
1010
from .base_node import BaseNode
11-
from ..prompts.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni
12-
11+
from ..prompts.generate_answer_node_omni_prompts import TEMPLATE_NO_CHUNKS_OMNI, TEMPLATE_CHUNKS_OMNI, TEMPLATE_MERGE_OMNI
1312

1413
class GenerateAnswerOmniNode(BaseNode):
1514
"""
@@ -82,22 +81,22 @@ def execute(self, state: dict) -> dict:
8281
output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
8382
else:
8483
output_parser = JsonOutputParser()
85-
template_no_chunk_omni_prompt = template_no_chunk_omni
86-
template_chunks_omni_prompt = template_chunks_omni
87-
template_merge_omni_prompt= template_merge_omni
84+
TEMPLATE_NO_CHUNKS_OMNI_prompt = TEMPLATE_NO_CHUNKS_OMNI
85+
TEMPLATE_CHUNKS_OMNI_prompt = TEMPLATE_CHUNKS_OMNI
86+
TEMPLATE_MERGE_OMNI_prompt= TEMPLATE_MERGE_OMNI
8887

8988
if self.additional_info is not None:
90-
template_no_chunk_omni_prompt = self.additional_info + template_no_chunk_omni_prompt
91-
template_chunks_omni_prompt = self.additional_info + template_chunks_omni_prompt
92-
template_merge_omni_prompt = self.additional_info + template_merge_omni_prompt
89+
TEMPLATE_NO_CHUNKS_OMNI_prompt = self.additional_info + TEMPLATE_NO_CHUNKS_OMNI_prompt
90+
TEMPLATE_CHUNKS_OMNI_prompt = self.additional_info + TEMPLATE_CHUNKS_OMNI_prompt
91+
TEMPLATE_MERGE_OMNI_prompt = self.additional_info + TEMPLATE_MERGE_OMNI_prompt
9392

9493
format_instructions = output_parser.get_format_instructions()
9594

9695

9796
chains_dict = {}
9897
if len(doc) == 1:
9998
prompt = PromptTemplate(
100-
template=template_no_chunk_omni_prompt,
99+
template=TEMPLATE_NO_CHUNKS_OMNI_prompt,
101100
input_variables=["question"],
102101
partial_variables={
103102
"context": doc,
@@ -116,7 +115,7 @@ def execute(self, state: dict) -> dict:
116115
tqdm(doc, desc="Processing chunks", disable=not self.verbose)
117116
):
118117
prompt = PromptTemplate(
119-
template=template_chunks_omni_prompt,
118+
template=TEMPLATE_CHUNKS_OMNI_prompt,
120119
input_variables=["question"],
121120
partial_variables={
122121
"context": chunk,
@@ -134,7 +133,7 @@ def execute(self, state: dict) -> dict:
134133
batch_results = async_runner.invoke({"question": user_prompt})
135134

136135
merge_prompt = PromptTemplate(
137-
template = template_merge_omni_prompt,
136+
template = TEMPLATE_MERGE_OMNI_prompt,
138137
input_variables=["context", "question"],
139138
partial_variables={"format_instructions": format_instructions},
140139
)

0 commit comments

Comments
 (0)