add chunking function

VinciGit00 · VinciGit00 · commit 380174d49033 · 2024-09-10T13:52:15.000+02:00
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
@@ -6,6 +6,7 @@
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
+from tokenizer import num_tokens_calculus
 
 class ParseNode(BaseNode):
     """
diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
@@ -11,6 +11,9 @@
 from .cleanup_html import cleanup_html
 from .logging import *
 from .convert_to_md import convert_to_md
-from .screenshot_scraping.screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
+from .screenshot_scraping.screenshot_preparation import (take_screenshot,
+                                                         select_area_with_opencv,
+                                                         select_area_with_ipywidget,
+                                                         crop_image)
 from .screenshot_scraping.text_detection import detect_text
-from .token_calculator import *
+from .tokenizer import num_tokens_calculus
diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py
diff --git a/scrapegraphai/utils/tokenizer.py b/scrapegraphai/utils/tokenizer.py
@@ -0,0 +1,10 @@
+"""
+Module for calculating the number of tokens for OpenAI models
+"""
+import tiktoken
+
+def num_tokens_calculus(string: str) -> int:
+    """Returns the number of tokens in a text string."""
+    encoding = tiktoken.get_encoding("cl100k_base")
+    num_tokens = len(encoding.encode(string))
+    return num_tokens