
Commit dc4a76b

use semchunk by default, as the other code causes the tokenizer to be called for every individual word, which is very slow, especially with the Mistral tokenizer

1 parent da9726f commit dc4a76b

File tree

1 file changed (+2 −2 lines)

scrapegraphai/utils/split_text_into_chunks.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 from .tokenizer import num_tokens_calculus # Import the new tokenizing function
 from langchain_core.language_models.chat_models import BaseChatModel

-def split_text_into_chunks(text: str, chunk_size: int, model: BaseChatModel, use_semchunk=False) -> List[str]:
+def split_text_into_chunks(text: str, chunk_size: int, model: BaseChatModel, use_semchunk=True) -> List[str]:
     """
     Splits the text into chunks based on the number of tokens.

@@ -20,7 +20,7 @@ def split_text_into_chunks(text: str, chunk_size: int, model: BaseChatModel, use
     if use_semchunk:
         from semchunk import chunk
         def count_tokens(text):
-            return num_tokens_calculus(text, llm_model)
+            return num_tokens_calculus(text, model)

         chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
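Below is a minimal runnable sketch of the semchunk path this commit turns on by default, assuming semchunk's chunk(text, chunk_size, token_counter) API; count_tokens is a stand-in for num_tokens_calculus(text, model) so the example runs without an LLM tokenizer. semchunk counts tokens over whole candidate spans (with memoization) rather than once per word, which is what makes it faster here.

from typing import List

from semchunk import chunk  # pip install semchunk

def count_tokens(text: str) -> int:
    # Stand-in for num_tokens_calculus(text, model): any callable mapping a
    # string to a token count works as semchunk's token_counter.
    return len(text.split())

def split_text_into_chunks(text: str, chunk_size: int) -> List[str]:
    # Keep headroom below the target size, as the patched function does.
    chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
    return chunk(text, chunk_size=chunk_size, token_counter=count_tokens)

chunks = split_text_into_chunks("lorem ipsum dolor " * 2000, chunk_size=1000)
print(len(chunks))  # about a dozen chunks of roughly 500 tokens each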