
Commit 4a16f14

Merge pull request #660 from tm-robinson/651-add-tokenization-for-ollama-and-mistral

651 add tokenization for ollama and mistral

2 parents c64ce88 + dc4a76b

7 files changed: +186 -33 lines changed

scrapegraphai/graphs/abstract_graph.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -40,7 +40,7 @@ class AbstractGraph(ABC):
         ...         return graph
         ...
         >>> my_graph = MyGraph("Example Graph",
-        {"llm": {"model": "openai/gpt-3.5-turbo"}}, "example_source")
+        {"llm": {"model": "gpt-3.5-turbo"}}, "example_source")
         >>> result = my_graph.run()
         """
 
```

scrapegraphai/nodes/parse_node.py

Lines changed: 9 additions & 4 deletions

```diff
@@ -41,6 +41,9 @@ def __init__(
             True if node_config is None else node_config.get("parse_html", True)
         )
 
+        self.llm_model = node_config.get("llm_model")
+        self.chunk_size = node_config.get("chunk_size")
+
     def execute(self, state: dict) -> dict:
         """
         Executes the node's logic to parse the HTML document content and split it into chunks.
@@ -69,19 +72,21 @@ def execute(self, state: dict) -> dict:
             docs_transformed = docs_transformed[0]
 
             chunks = split_text_into_chunks(text=docs_transformed.page_content,
-                                            chunk_size=self.node_config.get("chunk_size", 4096)-250)
+                                            chunk_size=self.chunk_size-250, model=self.llm_model)
         else:
             docs_transformed = docs_transformed[0]
 
-            chunk_size = self.node_config.get("chunk_size", 4096)
+            chunk_size = self.chunk_size
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 
             if isinstance(docs_transformed, Document):
                 chunks = split_text_into_chunks(text=docs_transformed.page_content,
-                                                chunk_size=chunk_size)
+                                                chunk_size=chunk_size,
+                                                model=self.llm_model)
             else:
                 chunks = split_text_into_chunks(text=docs_transformed,
-                                                chunk_size=chunk_size)
+                                                chunk_size=chunk_size,
+                                                model=self.llm_model)
 
         state.update({self.output[0]: chunks})
 
```

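Note that ParseNode no longer falls back to a default chunk size of 4096: both the model and the chunk size now arrive via node_config. A minimal sketch of wiring the node up, assuming ParseNode keeps the usual input/output/node_config constructor; the model choice and node wiring here are illustrative, not taken from the diff:

```python
from langchain_ollama import ChatOllama
from scrapegraphai.nodes.parse_node import ParseNode

llm = ChatOllama(model="llama3")  # illustrative model

parse_node = ParseNode(
    input="doc",
    output=["parsed_doc"],
    node_config={
        "llm_model": llm,    # read in __init__, used to pick the tokenizer
        "chunk_size": 4096,  # required now: execute() no longer defaults it
        "parse_html": True,
    },
)
```
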
scrapegraphai/utils/split_text_into_chunks.py

Lines changed: 40 additions & 22 deletions

```diff
@@ -3,8 +3,9 @@
 """
 from typing import List
 from .tokenizer import num_tokens_calculus  # Import the new tokenizing function
+from langchain_core.language_models.chat_models import BaseChatModel
 
-def split_text_into_chunks(text: str, chunk_size: int) -> List[str]:
+def split_text_into_chunks(text: str, chunk_size: int, model: BaseChatModel, use_semchunk=True) -> List[str]:
     """
     Splits the text into chunks based on the number of tokens.
 
@@ -15,26 +16,43 @@ def split_text_into_chunks(text: str, chunk_size: int) -> List[str]:
     Returns:
         List[str]: A list of text chunks.
     """
-    tokens = num_tokens_calculus(text)
-    if tokens <= chunk_size:
-        return [text]
-
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    words = text.split()
-    for word in words:
-        word_tokens = num_tokens_calculus(word)
-        if current_length + word_tokens > chunk_size:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = word_tokens
-        else:
-            current_chunk.append(word)
-            current_length += word_tokens
 
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
+    if use_semchunk:
+        from semchunk import chunk
+        def count_tokens(text):
+            return num_tokens_calculus(text, model)
+
+        chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
+
+        chunks = chunk(text=text,
+                       chunk_size=chunk_size,
+                       token_counter=count_tokens,
+                       memoize=False)
+        return chunks
+
+    else:
+
+        tokens = num_tokens_calculus(text, model)
+
+        if tokens <= chunk_size:
+            return [text]
+
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        words = text.split()
+        for word in words:
+            word_tokens = num_tokens_calculus(word, model)
+            if current_length + word_tokens > chunk_size:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = [word]
+                current_length = word_tokens
+            else:
+                current_chunk.append(word)
+                current_length += word_tokens
 
-    return chunks
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+
+        return chunks
```

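A quick sketch of calling the updated helper directly; the ChatOpenAI instance and input text are illustrative, and the default use_semchunk=True path requires the semchunk package:

```python
from langchain_openai import ChatOpenAI
from scrapegraphai.utils.split_text_into_chunks import split_text_into_chunks

llm = ChatOpenAI(model="gpt-3.5-turbo")  # illustrative model
text = open("page.txt").read()           # any long document text

# Default path: semchunk splits on semantic boundaries, counting tokens with
# the model-aware num_tokens_calculus; chunk_size is first shrunk to
# min(chunk_size - 500, int(chunk_size * 0.9)) as a safety margin.
chunks = split_text_into_chunks(text=text, chunk_size=4096, model=llm)

# Fallback: the original word-by-word splitter, now also model-aware.
chunks = split_text_into_chunks(text=text, chunk_size=4096, model=llm,
                                use_semchunk=False)
```
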
scrapegraphai/utils/tokenizer.py

Lines changed: 25 additions & 6 deletions

```diff
@@ -1,10 +1,29 @@
+"""
+Module for counting tokens and splitting text into chunks
 """
-Module for calculting the token_for_openai
-"""
-import tiktoken
+from typing import List
+from langchain_openai import ChatOpenAI
+from langchain_ollama import ChatOllama
+from langchain_mistralai import ChatMistralAI
+from langchain_core.language_models.chat_models import BaseChatModel
 
-def num_tokens_calculus(string: str) -> int:
+def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
     """Returns the number of tokens in a text string."""
-    encoding = tiktoken.get_encoding("cl100k_base")
-    num_tokens = len(encoding.encode(string))
+
+    if isinstance(llm_model, ChatOpenAI):
+        from .tokenizers.tokenizer_openai import num_tokens_openai
+        num_tokens_fn = num_tokens_openai
+
+    elif isinstance(llm_model, ChatMistralAI):
+        from .tokenizers.tokenizer_mistral import num_tokens_mistral
+        num_tokens_fn = num_tokens_mistral
+
+    elif isinstance(llm_model, ChatOllama):
+        from .tokenizers.tokenizer_ollama import num_tokens_ollama
+        num_tokens_fn = num_tokens_ollama
+
+    else:
+        raise NotImplementedError(f"There is no tokenization implementation for model '{llm_model}'")
+
+    num_tokens = num_tokens_fn(string, llm_model)
     return num_tokens
```
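
With this dispatch in place, callers pass the chat-model instance and the matching backend tokenizer is chosen by isinstance check; for example (model name illustrative):

```python
from langchain_ollama import ChatOllama
from scrapegraphai.utils.tokenizer import num_tokens_calculus

n = num_tokens_calculus("How many tokens is this?", ChatOllama(model="llama3"))

# Any other chat-model class raises NotImplementedError rather than
# silently falling back to the old cl100k_base count.
```
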
scrapegraphai/utils/tokenizers/tokenizer_mistral.py

Lines changed: 46 additions & 0 deletions

```diff
@@ -0,0 +1,46 @@
+"""
+Tokenization utilities for Mistral models
+"""
+from mistral_common.protocol.instruct.messages import UserMessage
+from mistral_common.protocol.instruct.request import ChatCompletionRequest
+from mistral_common.protocol.instruct.tool_calls import Function, Tool
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+from langchain_core.language_models.chat_models import BaseChatModel
+from ..logging import get_logger
+
+
+def num_tokens_mistral(text: str, llm_model:BaseChatModel) -> int:
+    """
+    Estimate the number of tokens in a given text using Mistral's tokenization method,
+    adjusted for different Mistral models.
+
+    Args:
+        text (str): The text to be tokenized and counted.
+        llm_model (BaseChatModel): The specific Mistral model to adjust tokenization.
+
+    Returns:
+        int: The number of tokens in the text.
+    """
+
+    logger = get_logger()
+
+    logger.debug(f"Counting tokens for text of {len(text)} characters")
+    try:
+        model = llm_model.model
+    except AttributeError:
+        raise NotImplementedError(f"The model provider you are using ('{llm_model}') "
+                                  "does not give us a model name so we cannot identify which encoding to use")
+
+    tokenizer = MistralTokenizer.from_model(model)
+
+    tokenized = tokenizer.encode_chat_completion(
+        ChatCompletionRequest(
+            tools=[],
+            messages=[
+                UserMessage(content=text),
+            ],
+            model=model,
+        )
+    )
+    tokens = tokenized.tokens
+    return len(tokens)
```
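
Counting tokens for a Mistral model then looks like this (a sketch; the model name is illustrative, and mistral_common resolves the right tokenizer from it via MistralTokenizer.from_model):

```python
from langchain_mistralai import ChatMistralAI
from scrapegraphai.utils.tokenizers.tokenizer_mistral import num_tokens_mistral

llm = ChatMistralAI(model="open-mistral-7b")  # illustrative model
# Wraps the text in a single UserMessage and counts the encoded request.
print(num_tokens_mistral("How many tokens is this?", llm))
```
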
scrapegraphai/utils/tokenizers/tokenizer_ollama.py

Lines changed: 28 additions & 0 deletions

```diff
@@ -0,0 +1,28 @@
+"""
+Tokenization utilities for Ollama models
+"""
+from langchain_core.language_models.chat_models import BaseChatModel
+from ..logging import get_logger
+
+def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
+    """
+    Estimate the number of tokens in a given text using Ollama's tokenization method,
+    adjusted for different Ollama models.
+
+    Args:
+        text (str): The text to be tokenized and counted.
+        llm_model (BaseChatModel): The specific Ollama model to adjust tokenization.
+
+    Returns:
+        int: The number of tokens in the text.
+    """
+
+    logger = get_logger()
+
+    logger.debug(f"Counting tokens for text of {len(text)} characters")
+
+    # Use langchain token count implementation
+    # NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507
+    tokens = llm_model.get_num_tokens(text)
+    return tokens
+
```
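
Since Ollama exposes no tokenize endpoint (see the linked issue), this delegates to LangChain's get_num_tokens, whose default implementation approximates with a GPT-2 tokenizer, so treat the count as an estimate. A usage sketch (model name illustrative):

```python
from langchain_ollama import ChatOllama
from scrapegraphai.utils.tokenizers.tokenizer_ollama import num_tokens_ollama

llm = ChatOllama(model="llama3")  # illustrative model
print(num_tokens_ollama("How many tokens is this?", llm))  # approximate count
```
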
scrapegraphai/utils/tokenizers/tokenizer_openai.py

Lines changed: 37 additions & 0 deletions

```diff
@@ -0,0 +1,37 @@
+"""
+Tokenization utilities for OpenAI models
+"""
+import tiktoken
+from langchain_core.language_models.chat_models import BaseChatModel
+from ..logging import get_logger
+
+def num_tokens_openai(text: str, llm_model:BaseChatModel) -> int:
+    """
+    Estimate the number of tokens in a given text using OpenAI's tokenization method,
+    adjusted for different OpenAI models.
+
+    Args:
+        text (str): The text to be tokenized and counted.
+        llm_model (BaseChatModel): The specific OpenAI model to adjust tokenization.
+
+    Returns:
+        int: The number of tokens in the text.
+    """
+
+    logger = get_logger()
+
+    logger.debug(f"Counting tokens for text of {len(text)} characters")
+    try:
+        model = llm_model.model_name
+    except AttributeError:
+        raise NotImplementedError(f"The model provider you are using ('{llm_model}') "
+                                  "does not give us a model name so we cannot identify which encoding to use")
+
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        raise NotImplementedError(f"Tiktoken does not support identifying the encoding for "
+                                  "the model '{model}'")
+
+    num_tokens = len(encoding.encode(text))
+    return num_tokens
```
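
For OpenAI models the count is exact, since tiktoken ships the actual encodings; a usage sketch (model name illustrative):

```python
from langchain_openai import ChatOpenAI
from scrapegraphai.utils.tokenizers.tokenizer_openai import num_tokens_openai

llm = ChatOpenAI(model="gpt-3.5-turbo")  # illustrative model
# encoding_for_model maps "gpt-3.5-turbo" to cl100k_base under the hood.
print(num_tokens_openai("How many tokens is this?", llm))
```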
