Commit 683bf57
fix(ParseNode): leave room for LLM reply in context window
1 parent: ebdb749

6 files changed: +14 -11 lines changed


pyproject.toml

Lines changed: 2 additions & 2 deletions

@@ -14,12 +14,12 @@ authors = [
 ]

 dependencies = [
-    "langchain==0.2.14",
+    "langchain>=0.2.14",
     "langchain-fireworks>=0.1.3",
     "langchain_community>=0.2.9",
     "langchain-google-genai>=1.0.7",
     "langchain-google-vertexai>=1.0.7",
-    "langchain-openai==0.1.22",
+    "langchain-openai>=0.1.22",
     "langchain-groq>=0.1.3",
     "langchain-aws>=0.1.3",
     "langchain-anthropic>=0.1.11",

requirements-dev.lock

Lines changed: 1 addition & 0 deletions

@@ -179,6 +179,7 @@ graphviz==0.20.3
     # via scrapegraphai
 greenlet==3.0.3
     # via playwright
+    # via sqlalchemy
 groq==0.9.0
     # via langchain-groq
 grpc-google-iam-v1==0.13.1

requirements.lock

Lines changed: 1 addition & 0 deletions

@@ -133,6 +133,7 @@ graphviz==0.20.3
     # via scrapegraphai
 greenlet==3.0.3
     # via playwright
+    # via sqlalchemy
 groq==0.9.0
     # via langchain-groq
 grpc-google-iam-v1==0.13.1

requirements.txt

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
-langchain>=0.2.10
+langchain>=0.2.14
 langchain-fireworks>=0.1.3
 langchain_community>=0.2.9
 langchain-google-genai>=1.0.7
 langchain-google-vertexai>=1.0.7
-langchain-openai>=0.1.17
+langchain-openai>=0.1.22
 langchain-groq>=0.1.3
 langchain-aws>=0.1.3
 langchain-anthropic>=0.1.11

scrapegraphai/nodes/parse_node.py

Lines changed: 6 additions & 5 deletions

@@ -6,7 +6,6 @@
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
-from ..utils.logging import get_logger
 from .base_node import BaseNode

 class ParseNode(BaseNode):
@@ -79,16 +78,18 @@ def execute(self, state: dict) -> dict:
         else:
             docs_transformed = docs_transformed[0]

+        # Adapt the chunk size, leaving room for the reply, the prompt and the schema
+        chunk_size = self.node_config.get("chunk_size", 4096)
+        chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
+
         if isinstance(docs_transformed, Document):
-
             chunks = chunk(text=docs_transformed.page_content,
-                           chunk_size=self.node_config.get("chunk_size", 4096)-250,
+                           chunk_size=chunk_size,
                            token_counter=lambda text: len(text.split()),
                            memoize=False)
         else:
-
             chunks = chunk(text=docs_transformed,
-                           chunk_size=self.node_config.get("chunk_size", 4096)-250,
+                           chunk_size=chunk_size,
                            token_counter=lambda text: len(text.split()),
                            memoize=False)
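
To make the effect of the new sizing rule concrete, here is a minimal sketch of the calculation (illustrative only; the helper name and the example window sizes are assumptions, not part of the library):

# Reserve room for the reply, the prompt and the schema by taking whichever
# is smaller: the configured window minus 500 tokens, or 90% of the window.
def adapted_chunk_size(configured: int = 4096) -> int:
    return min(configured - 500, int(configured * 0.9))

for window in (2048, 4096, 8192, 128000):
    print(window, "->", adapted_chunk_size(window))
# 2048 -> 1548      (the flat 500-token reserve dominates below 5000 tokens)
# 4096 -> 3596
# 8192 -> 7372      (the 10% reserve dominates above 5000 tokens)
# 128000 -> 115200

Compared with the previous flat "chunk_size - 250", this keeps at least 500 tokens free and scales the reserve with larger context windows.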

scrapegraphai/utils/token_calculator.py

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 """
-Module for truncatinh in chunks the messages
+Module for truncating in chunks the messages
 """
 from typing import List
 import tiktoken
@@ -27,7 +27,7 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]
     """

     encoding = tiktoken.get_encoding(encoding_name)
-    max_tokens = models_tokens[model] - 500
+    max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9))
     encoded_text = encoding.encode(text)

     chunks = [encoded_text[i:i + max_tokens]
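
The same guard now caps token-based truncation. A hedged sketch of the pattern, assuming a models_tokens-style mapping from model name to context size (the mapping value and the function name below are illustrative, not scrapegraphai's actual table):

import tiktoken

models_tokens = {"gpt-3.5-turbo": 4096}  # illustrative entry, not the library's data

def truncate_to_token_chunks(text: str, model: str, encoding_name: str = "cl100k_base"):
    encoding = tiktoken.get_encoding(encoding_name)
    limit = models_tokens[model]
    # Leave at least 500 tokens (or 10% of the window, whichever is larger)
    # free for the LLM's reply, mirroring the change to truncate_text_tokens.
    max_tokens = min(limit - 500, int(limit * 0.9))
    tokens = encoding.encode(text)
    return [encoding.decode(tokens[i:i + max_tokens])
            for i in range(0, len(tokens), max_tokens)]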
