Skip to content

Commit 8cb9646

Browse files
committed
Merge branch 'main' into pre/beta
2 parents 9266a36 + 58b1133 commit 8cb9646

24 files changed

+75
-19
lines changed

.github/FUNDING.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# These are supported funding model platforms
2+
3+
github: ScrapeGraphAI
4+
patreon: # Replace with a single Patreon username
5+
open_collective:
6+
ko_fi: # Replace with a single Ko-fi username
7+
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8+
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9+
liberapay: # Replace with a single Liberapay username
10+
issuehunt: # Replace with a single IssueHunt username
11+
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12+
polar: # Replace with a single Polar username
13+
buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
14+
thanks_dev: # Replace with a single thanks.dev username
15+
custom:

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,18 @@
55

66
* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3))
77

8-
## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14)
98

9+
## [1.26.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6) (2024-10-18)
10+
11+
## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14)
1012

1113
### Bug Fixes
1214

1315
* remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918))
1416

17+
* refactoring of gpt2 tokenizer ([44c3f9c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/44c3f9c98939c44caa86dc582242819a7c6a0f80))
18+
>>>>>>> main
19+
1520
## [1.26.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.4...v1.26.5) (2024-10-13)
1621

1722

examples/extras/.env.example

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1-
OPENAI_API_KEY="OPENAI_API_KEY"
2-
BROWSER_BASE_PROJECT_ID="BROWSER_BASE_PROJECT_ID"
3-
BROWSER_BASE_API_KEY="BROWSERBASE_API_KEY"
1+
OPENAI_API_KEY="YOUR_OPENAI_API_KEY"
2+
BROWSER_BASE_PROJECT_ID="YOUR_BROWSER_BASE_PROJECT_ID"
3+
BROWSER_BASE_API_KEY="YOUR_BROWSERBASE_API_KEY"
4+
SCRAPE_DO_API_KEY="YOUR_SCRAPE_DO_API_KEY"

examples/openai/smart_scraper_openai.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
# ************************************************
2929

3030
smart_scraper_graph = SmartScraperGraph(
31-
prompt="List me what does the company do, the name and a contact email.",
32-
source="https://scrapegraphai.com/",
31+
prompt="Extract me all the articles",
32+
source="https://www.wired.com",
3333
config=graph_config
3434
)
3535

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ name = "scrapegraphai"
33

44
version = "1.27.0b1"
55

6+
67
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
78
authors = [
89
{ name = "Marco Vinciguerra", email = "[email protected]" },

scrapegraphai/graphs/abstract_graph.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ def _create_llm(self, llm_config: dict) -> object:
128128
if requests_per_second is not None:
129129
with warnings.catch_warnings():
130130
warnings.simplefilter("ignore")
131-
llm_params["rate_limiter"] = InMemoryRateLimiter(requests_per_second=requests_per_second)
131+
llm_params["rate_limiter"] = InMemoryRateLimiter(
132+
requests_per_second=requests_per_second)
132133
if max_retries is not None:
133134
llm_params["max_retries"] = max_retries
134135

scrapegraphai/graphs/base_graph.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def __init__(self, nodes: list, edges: list, entry_point: str,
5959
# raise a warning if the entry point is not the first node in the list
6060
warnings.warn(
6161
"Careful! The entry point node is different from the first node in the graph.")
62-
62+
6363
self._set_conditional_node_edges()
6464

6565
# Burr configuration
@@ -89,11 +89,9 @@ def _set_conditional_node_edges(self):
8989
"""
9090
for node in self.nodes:
9191
if node.node_type == 'conditional_node':
92-
# Find outgoing edges from this ConditionalNode
9392
outgoing_edges = [(from_node, to_node) for from_node, to_node in self.raw_edges if from_node.node_name == node.node_name]
9493
if len(outgoing_edges) != 2:
9594
raise ValueError(f"ConditionalNode '{node.node_name}' must have exactly two outgoing edges.")
96-
# Assign true_node_name and false_node_name
9795
node.true_node_name = outgoing_edges[0][1].node_name
9896
try:
9997
node.false_node_name = outgoing_edges[1][1].node_name

scrapegraphai/graphs/code_generator_graph.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ def _create_graph(self) -> BaseGraph:
9999
"schema": self.schema,
100100
}
101101
)
102+
102103
prompt_refier_node = PromptRefinerNode(
103104
input="user_prompt",
104105
output=["refined_prompt"],
@@ -108,6 +109,7 @@ def _create_graph(self) -> BaseGraph:
108109
"schema": self.schema
109110
}
110111
)
112+
111113
html_analyzer_node = HtmlAnalyzerNode(
112114
input="refined_prompt & original_html",
113115
output=["html_info", "reduced_html"],
@@ -118,6 +120,7 @@ def _create_graph(self) -> BaseGraph:
118120
"reduction": self.config.get("reduction", 0)
119121
}
120122
)
123+
121124
generate_code_node = GenerateCodeNode(
122125
input="user_prompt & refined_prompt & html_info & reduced_html & answer",
123126
output=["generated_code"],

scrapegraphai/graphs/csv_scraper_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def _create_graph(self):
5959
"""
6060
Creates the graph of nodes representing the workflow for web scraping.
6161
"""
62+
6263
fetch_node = FetchNode(
6364
input="csv | csv_dir",
6465
output=["doc"],
@@ -90,6 +91,7 @@ def run(self) -> str:
9091
"""
9192
Executes the web scraping process and returns the answer to the prompt.
9293
"""
94+
9395
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
9496
self.final_state, self.execution_info = self.graph.execute(inputs)
9597

scrapegraphai/graphs/csv_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def run(self) -> str:
9494
Returns:
9595
str: The answer to the prompt.
9696
"""
97+
9798
inputs = {"user_prompt": self.prompt, "jsons": self.source}
9899
self.final_state, self.execution_info = self.graph.execute(inputs)
99100

scrapegraphai/graphs/document_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def run(self) -> str:
9494
Returns:
9595
str: The answer to the prompt.
9696
"""
97+
9798
inputs = {"user_prompt": self.prompt, "xmls": self.source}
9899
self.final_state, self.execution_info = self.graph.execute(inputs)
99100

scrapegraphai/graphs/json_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ def run(self) -> str:
9595
Returns:
9696
str: The answer to the prompt.
9797
"""
98+
9899
inputs = {"user_prompt": self.prompt, "jsons": self.source}
99100
self.final_state, self.execution_info = self.graph.execute(inputs)
100101

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,15 @@ def _create_graph(self) -> BaseGraph:
6262
Returns:
6363
BaseGraph: A graph instance representing the web scraping workflow.
6464
"""
65+
6566
fetch_node = FetchNode(
6667
input="url | local_dir",
6768
output=["doc"],
6869
node_config={
6970
"loader_kwargs": self.config.get("loader_kwargs", {}),
7071
}
7172
)
73+
7274
parse_node = ParseNode(
7375
input="doc & (url | local_dir)",
7476
output=["parsed_doc", "link_urls", "img_urls"],
@@ -78,6 +80,7 @@ def _create_graph(self) -> BaseGraph:
7880
"llm_model": self.llm_model
7981
}
8082
)
83+
8184
image_to_text_node = ImageToTextNode(
8285
input="img_urls",
8386
output=["img_desc"],

scrapegraphai/graphs/omni_search_graph.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,6 @@ def _create_graph(self) -> BaseGraph:
5959
BaseGraph: A graph instance representing the web scraping and searching workflow.
6060
"""
6161

62-
# omni_scraper_instance = OmniScraperGraph(
63-
# prompt="",
64-
# source="",
65-
# config=self.copy_config,
66-
# schema=self.copy_schema
67-
# )
68-
6962
search_internet_node = SearchInternetNode(
7063
input="user_prompt",
7164
output=["urls"],
@@ -115,6 +108,7 @@ def run(self) -> str:
115108
Returns:
116109
str: The answer to the prompt.
117110
"""
111+
118112
inputs = {"user_prompt": self.prompt}
119113
self.final_state, self.execution_info = self.graph.execute(inputs)
120114

scrapegraphai/graphs/script_creator_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def run(self) -> str:
9191
Returns:
9292
str: The answer to the prompt.
9393
"""
94+
9495
inputs = {"user_prompt": self.prompt, "urls": self.source}
9596
self.final_state, self.execution_info = self.graph.execute(inputs)
9697
return self.final_state.get("merged_script", "Failed to generate the script.")

scrapegraphai/graphs/search_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ def run(self) -> str:
110110
Returns:
111111
str: The answer to the prompt.
112112
"""
113+
113114
inputs = {"user_prompt": self.prompt}
114115
self.final_state, self.execution_info = self.graph.execute(inputs)
115116

@@ -126,4 +127,5 @@ def get_considered_urls(self) -> List[str]:
126127
Returns:
127128
List[str]: A list of URLs considered during the search.
128129
"""
130+
129131
return self.considered_urls

scrapegraphai/graphs/search_link_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def _create_graph(self) -> BaseGraph:
4747
Returns:
4848
BaseGraph: A graph instance representing the web scraping workflow.
4949
"""
50+
5051
fetch_node = FetchNode(
5152
input="url| local_dir",
5253
output=["doc"],

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def _create_graph(self) -> BaseGraph:
5959
Returns:
6060
BaseGraph: A graph instance representing the web scraping workflow.
6161
"""
62+
6263
fetch_node = FetchNode(
6364
input="url| local_dir",
6465
output=["doc"],

scrapegraphai/graphs/smart_scraper_multi_concat_graph.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ class SmartScraperMultiConcatGraph(AbstractGraph):
4141
... )
4242
>>> result = search_graph.run()
4343
"""
44-
44+
4545
def __init__(self, prompt: str, source: List[str],
4646
config: dict, schema: Optional[BaseModel] = None):
4747

@@ -122,6 +122,7 @@ def run(self) -> str:
122122
Returns:
123123
str: The answer to the prompt.
124124
"""
125+
125126
inputs = {"user_prompt": self.prompt, "urls": self.source}
126127
self.final_state, self.execution_info = self.graph.execute(inputs)
127128

scrapegraphai/graphs/smart_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def run(self) -> str:
9696
Returns:
9797
str: The answer to the prompt.
9898
"""
99+
99100
inputs = {"user_prompt": self.prompt, "urls": self.source}
100101
self.final_state, self.execution_info = self.graph.execute(inputs)
101102

scrapegraphai/graphs/xml_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ def run(self) -> str:
9393
Returns:
9494
str: The answer to the prompt.
9595
"""
96+
9697
inputs = {"user_prompt": self.prompt, "xmls": self.source}
9798
self.final_state, self.execution_info = self.graph.execute(inputs)
9899

scrapegraphai/utils/tokenizer.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from langchain_ollama import ChatOllama
77
from langchain_mistralai import ChatMistralAI
88
from langchain_core.language_models.chat_models import BaseChatModel
9+
from transformers import GPT2TokenizerFast
910

1011
def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
1112
"""
@@ -23,6 +24,13 @@ def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
2324
from .tokenizers.tokenizer_ollama import num_tokens_ollama
2425
num_tokens_fn = num_tokens_ollama
2526

27+
elif isinstance(llm_model, GPT2TokenizerFast):
28+
def num_tokens_gpt2(text: str, model: BaseChatModel) -> int:
29+
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
30+
tokens = tokenizer.encode(text)
31+
return len(tokens)
32+
num_tokens_fn = num_tokens_gpt2
33+
2634
else:
2735
from .tokenizers.tokenizer_openai import num_tokens_openai
2836
num_tokens_fn = num_tokens_openai

scrapegraphai/utils/tokenizers/tokenizer_ollama.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44
from langchain_core.language_models.chat_models import BaseChatModel
55
from ..logging import get_logger
6+
from transformers import GPT2TokenizerFast
67

78
def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
89
"""
@@ -21,8 +22,12 @@ def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
2122

2223
logger.debug(f"Counting tokens for text of {len(text)} characters")
2324

25+
if isinstance(llm_model, GPT2TokenizerFast):
26+
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
27+
tokens = tokenizer.encode(text)
28+
return len(tokens)
29+
2430
# Use langchain token count implementation
2531
# NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507
2632
tokens = llm_model.get_num_tokens(text)
2733
return tokens
28-

tests/graphs/smart_scraper_ollama_test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44
import pytest
55
from scrapegraphai.graphs import SmartScraperGraph
6+
from transformers import GPT2TokenizerFast
67

78

89
@pytest.fixture
@@ -50,3 +51,11 @@ def test_get_execution_info(graph_config: dict):
5051
graph_exec_info = smart_scraper_graph.get_execution_info()
5152

5253
assert graph_exec_info is not None
54+
55+
56+
def test_gpt2_tokenizer_loading():
57+
"""
58+
Test loading of GPT2TokenizerFast
59+
"""
60+
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
61+
assert tokenizer is not None

0 commit comments

Comments (0)