
Commit fd0a902

Merge pull request #619 from tm-robinson/543-ScriptCreatorGraph-only-use-first-chunk
543 script creator graph only use first chunk
2 parents ba5c7ad + e741602 commit fd0a902

14 files changed: +73 / -22 lines

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
@@ -1,4 +1,5 @@
-## [1.16.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0-beta.3...v1.16.0-beta.4) (2024-09-02)
+## [1.16.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.2...v1.16.0) (2024-09-01)
+


 ### Features
@@ -11,6 +12,9 @@
 * deepcopy fail for coping model_instance config ([cd07418](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cd07418474112cecd53ab47866262f2f31294223))
 * fix pydantic object copy ([553527a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/553527a269cdd70c0c174ad5c78cbf35c00b22c1))

+## [1.15.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.1...v1.15.2) (2024-09-01)
+
+
 ## [1.16.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0-beta.2...v1.16.0-beta.3) (2024-09-01)


@@ -27,6 +31,7 @@



+
 ### Bug Fixes

 * pyproject.toml ([360ce1c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/360ce1c0e468c959e63555120ac7cecf55563846))

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 [project]
 name = "scrapegraphai"
+
 version = "1.16.0b4"

 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."

scrapegraphai/graphs/abstract_graph.py

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ def _create_llm(self, llm_config: dict) -> object:
         try:
             self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
         except KeyError:
-            print("Model not found, using default token size (8192)")
+            print(f"Model {llm_params['model_provider']}/{llm_params['model']} not found, using default token size (8192)")
             self.model_token = 8192

         try:

scrapegraphai/graphs/deep_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -75,7 +75,8 @@ def _create_repeated_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )

scrapegraphai/graphs/markdown_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -60,7 +60,8 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "parse_html": False,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         generate_answer_node = GenerateAnswerNode(

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -74,7 +74,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         image_to_text_node = ImageToTextNode(

scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "parse_html": False,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 3 additions & 2 deletions
@@ -73,11 +73,12 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={"chunk_size": self.model_token,
-                         "parse_html": False
+                         "parse_html": False,
+                         "llm_model": self.llm_model
                          }
         )
         generate_scraper_node = GenerateScraperNode(
-            input="user_prompt & (doc)",
+            input="user_prompt & (parsed_doc)",
             output=["answer"],
             node_config={
                 "llm_model": self.llm_model,

scrapegraphai/graphs/search_link_graph.py

Lines changed: 2 additions & 1 deletion
@@ -64,7 +64,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         search_link_node = SearchLinkNode(

scrapegraphai/graphs/speech_graph.py

Lines changed: 2 additions & 1 deletion
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         generate_answer_node = GenerateAnswerNode(

scrapegraphai/nodes/generate_scraper_node.py

Lines changed: 13 additions & 3 deletions
@@ -102,9 +102,19 @@ def execute(self, state: dict) -> dict:
             TEMPLATE_NO_CHUNKS += self.additional_info

         if len(doc) > 1:
-            raise NotImplementedError(
-                "Currently GenerateScraperNode cannot handle more than 1 context chunks"
-            )
+            # Short term partial fix for issue #543 (Context length exceeded)
+            # If there are more than one chunks returned by ParseNode we just use the first one
+            # on the basis that the structure of the remainder of the HTML page is probably
+            # very similar to the first chunk therefore the generated script should still work.
+            # The better fix is to generate multiple scripts then use the LLM to merge them.
+
+            #raise NotImplementedError(
+            #    "Currently GenerateScraperNode cannot handle more than 1 context chunks"
+            #)
+            self.logger.warn(f"Warning: {self.node_name} Node provided with {len(doc)} chunks but can only "
+                             "support 1, ignoring remaining chunks")
+            doc = [doc[0]]
+            template = TEMPLATE_NO_CHUNKS
         else:
             template = TEMPLATE_NO_CHUNKS
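The new comment block flags a longer-term fix: generate one script per chunk, then have the LLM merge them. A minimal sketch of that idea, assuming only a generic `llm` callable that maps a prompt string to a completion; none of the names or prompts below are ScrapeGraphAI APIs, and the real node would use its own templates:

```python
from typing import Callable, List


def generate_merged_script(llm: Callable[[str], str], chunks: List[str], user_prompt: str) -> str:
    """Sketch: generate one scraping script per chunk, then ask the LLM to merge them."""
    partial_scripts = [
        llm(
            f"Write a Python scraping script that satisfies: {user_prompt}\n\n"
            f"Base it only on this HTML fragment:\n{chunk_text}"
        )
        for chunk_text in chunks
    ]
    if len(partial_scripts) == 1:
        return partial_scripts[0]

    joined = "\n\n# --- next partial script ---\n\n".join(partial_scripts)
    return llm(
        "Merge the following partial scraping scripts into one script that "
        "handles the whole page, removing duplicated code:\n\n" + joined
    )
```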

scrapegraphai/nodes/parse_node.py

Lines changed: 8 additions & 5 deletions
@@ -40,6 +40,7 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
+        self.llm_model = node_config['llm_model']

     def execute(self, state: dict) -> dict:
         """
@@ -64,31 +65,33 @@ def execute(self, state: dict) -> dict:
         input_data = [state[key] for key in input_keys]
         docs_transformed = input_data[0]

+        def count_tokens(text):
+            from ..utils import token_count
+            return token_count(text, self.llm_model.model_name)
+
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]

             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                           token_counter=lambda text: len(text.split()),
+                           token_counter=count_tokens,
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]
-
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

             if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                                chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
+                               token_counter=count_tokens,
                                memoize=False)
             else:
                 chunks = chunk(text=docs_transformed,
                                chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
+                               token_counter=count_tokens,
                                memoize=False)

         state.update({self.output[0]: chunks})
-
         return state
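The switch away from `len(text.split())` matters because HTML packs many model tokens into few whitespace-separated words, so the old counter undercounted chunk sizes badly. A standalone comparison under assumptions: the helper mirrors what `token_count` does, the model name is only an example, and `tiktoken` must be recent enough to recognize it:

```python
import tiktoken


def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """Count tokens with the target model's own encoding, as the new counter does."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


# Illustrative markup-heavy input: few whitespace words, many tokens.
html = "<tr><td class='price'>$1,299.99</td><td class='sku'>AB-42</td></tr>" * 40
print("whitespace 'tokens':", len(html.split()))   # counts only space-separated words
print("model tokens:       ", count_tokens(html))  # what actually fills the context window
```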

scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@
 from .cleanup_html import cleanup_html
 from .logging import *
 from .convert_to_md import convert_to_md
+from .token_calculator import *

scrapegraphai/utils/token_calculator.py

Lines changed: 28 additions & 4 deletions
@@ -6,27 +6,26 @@
 from ..helpers.models_tokens import models_tokens


-def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
+def truncate_text_tokens(text: str, model: str) -> List[str]:
     """
     Truncates text into chunks that are small enough to be processed by specified llm models.

     Args:
         text (str): The input text to be truncated.
         model (str): The name of the llm model to determine the maximum token limit.
-        encoding_name (str): The encoding strategy used to encode the text before truncation.

     Returns:
         List[str]: A list of text chunks, each within the token limit of the specified model.

     Example:
-        >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING")
+        >>> truncate_text_tokens("This is a sample text for truncation.", "gpt-4o-mini")
         ["This is a sample text", "for truncation."]

     This function ensures that each chunk of text can be tokenized
     by the specified model without exceeding the model's token limit.
     """

-    encoding = tiktoken.get_encoding(encoding_name)
+    encoding = tiktoken.encoding_for_model(model)
     max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9))
     encoded_text = encoding.encode(text)

@@ -36,3 +35,28 @@ def truncate_text_tokens(text: str, model: str) -> List[str]
     result = [encoding.decode(chunk) for chunk in chunks]

     return result
+
+
+def token_count(text: str, model: str) -> List[str]:
+    """
+    Return the number of tokens within the text, based on the encoding of the specified model.
+
+    Args:
+        text (str): The input text to be counted.
+        model (str): The name of the llm model to determine the encoding.
+
+    Returns:
+        int: Number of tokens.
+
+    Example:
+        >>> token_count("This is a sample text for counting.", "gpt-4o-mini")
+        9
+
+
+    This function ensures that each chunk of text can be tokenized
+    by the specified model without exceeding the model's token limit.
+    """
+
+    encoding = tiktoken.encoding_for_model(model)
+    num_tokens = len(encoding.encode(text))
+
+    return num_tokens
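With the `__init__.py` change above, the new utility is importable from the package. A small usage sketch, with assumptions: the model name is only an example, `tiktoken` must recognize it, and the exact token count depends on that encoding:

```python
from functools import partial

from scrapegraphai.utils import token_count

text = "ScrapeGraphAI turns a natural-language prompt into a scraping pipeline."
print(token_count(text, "gpt-4o-mini"))  # number of tokens under that model's encoding

# Adapted into the single-argument counter shape that ParseNode passes to its chunker:
counter = partial(token_count, model="gpt-4o-mini")
print(counter(text))
```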
