Skip to content

Commit 5ec2de9

Browse files
committed
fix(chunking): count tokens from words instead of characters
closes #513
1 parent 7f1f750 commit 5ec2de9

File tree

1 file changed

+6
-6
lines changed

1 file changed

+6
-6
lines changed

scrapegraphai/nodes/parse_node.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -74,22 +74,22 @@ def execute(self, state: dict) -> dict:
74 74
docs_transformed = docs_transformed[0]
75 75

76 76
chunks = chunk(text=docs_transformed.page_content,
77-
chunk_size= self.node_config.get("chunk_size", 4096)-250,
78-
token_counter= lambda x: len(x),
77+
chunk_size=self.node_config.get("chunk_size", 4096)-250,
78+
token_counter=lambda text: len(text.split()),
79 79
memoize=False)
80 80
else:
81 81
docs_transformed = docs_transformed[0]
82 82

83 83
if isinstance(docs_transformed, Document):
84 84
chunks = chunk(text=docs_transformed.page_content,
85-
chunk_size= self.node_config.get("chunk_size", 4096)-250,
86-
token_counter= lambda x: len(x),
85+
chunk_size=self.node_config.get("chunk_size", 4096)-250,
86+
token_counter=lambda text: len(text.split()),
87 87
memoize=False)
88 88
else:
89 89

90 90
chunks = chunk(text=docs_transformed,
91-
chunk_size= self.node_config.get("chunk_size", 4096)-250,
92-
token_counter= lambda x: len(x),
91+
chunk_size=self.node_config.get("chunk_size", 4096)-250,
92+
token_counter=lambda text: len(text.split()),
93 93
memoize=False)
94 94

95 95
state.update({self.output[0]: chunks})

0 commit comments

Comments
 (0)