Skip to content

Commit 380174d

Browse files
committed
add chunking function
1 parent 38cba96 commit 380174d

File tree

4 files changed

+16
-64
lines changed

4 files changed

+16
-64
lines changed

scrapegraphai/nodes/parse_node.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from langchain_community.document_transformers import Html2TextTransformer
77
from langchain_core.documents import Document
88
from .base_node import BaseNode
9+
from tokenizer import num_tokens_calculus
910

1011
class ParseNode(BaseNode):
1112
"""

scrapegraphai/utils/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
from .cleanup_html import cleanup_html
1212
from .logging import *
1313
from .convert_to_md import convert_to_md
14-
from .screenshot_scraping.screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
14+
from .screenshot_scraping.screenshot_preparation import (take_screenshot,
15+
select_area_with_opencv,
16+
select_area_with_ipywidget,
17+
crop_image)
1518
from .screenshot_scraping.text_detection import detect_text
16-
from .token_calculator import *
19+
from .tokenizer import num_tokens_calculus

scrapegraphai/utils/token_calculator.py

Lines changed: 0 additions & 62 deletions
This file was deleted.

scrapegraphai/utils/tokenizer.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""
2+
Module for calculating the number of tokens in a text string (for OpenAI models)
3+
"""
4+
import tiktoken
5+
6+
def num_tokens_calculus(string: str) -> int:
7+
"""Return the number of tokens in ``string`` when encoded with tiktoken's "cl100k_base" encoding."""
8+
encoding = tiktoken.get_encoding("cl100k_base")
9+
num_tokens = len(encoding.encode(string))
10+
return num_tokens

0 commit comments

Comments
 (0)