Commit 1a7f21f

feat: removed semchunk and used tikton
1 parent 380174d commit 1a7f21f

6 files changed (+48 / -38 lines)

pyproject.toml

Lines changed: 0 additions & 1 deletion
@@ -29,7 +29,6 @@ dependencies = [
     "playwright>=1.43.0",
     "undetected-playwright>=0.3.0",
     "google>=3.0.0",
-    "semchunk>=1.0.1",
     "langchain-ollama>=0.1.3",
 ]

requirements-dev.lock

Lines changed: 0 additions & 10 deletions
@@ -71,7 +71,6 @@ cycler==0.12.1
 dataclasses-json==0.6.7
     # via langchain-community
 dill==0.3.8
-    # via multiprocess
     # via pylint
 distro==1.9.0
     # via openai
@@ -236,13 +235,9 @@ mdurl==0.1.2
     # via markdown-it-py
 minify-html==0.15.0
     # via scrapegraphai
-mpire==2.10.2
-    # via semchunk
 multidict==6.0.5
     # via aiohttp
     # via yarl
-multiprocess==0.70.16
-    # via mpire
 mypy-extensions==1.0.0
     # via typing-inspect
 narwhals==1.3.0
@@ -325,7 +320,6 @@ pyee==11.1.0
     # via playwright
 pygments==2.18.0
     # via furo
-    # via mpire
     # via rich
     # via sphinx
 pylint==3.2.6
@@ -373,8 +367,6 @@ rsa==4.9
     # via google-auth
 s3transfer==0.10.2
     # via boto3
-semchunk==2.2.0
-    # via scrapegraphai
 sf-hamilton==1.73.1
     # via burr
 six==1.16.0
@@ -436,10 +428,8 @@ tornado==6.4.1
 tqdm==4.66.5
     # via google-generativeai
     # via huggingface-hub
-    # via mpire
     # via openai
     # via scrapegraphai
-    # via semchunk
 typing-extensions==4.12.2
     # via altair
     # via anyio

requirements.lock

Lines changed: 0 additions & 12 deletions
@@ -41,8 +41,6 @@ charset-normalizer==3.3.2
     # via requests
 dataclasses-json==0.6.7
     # via langchain-community
-dill==0.3.8
-    # via multiprocess
 distro==1.9.0
     # via openai
 exceptiongroup==1.2.2
@@ -155,13 +153,9 @@ marshmallow==3.21.3
     # via dataclasses-json
 minify-html==0.15.0
     # via scrapegraphai
-mpire==2.10.2
-    # via semchunk
 multidict==6.0.5
     # via aiohttp
     # via yarl
-multiprocess==0.70.16
-    # via mpire
 mypy-extensions==1.0.0
     # via typing-inspect
 numpy==1.26.4
@@ -211,8 +205,6 @@ pydantic-core==2.20.1
     # via pydantic
 pyee==11.1.0
     # via playwright
-pygments==2.18.0
-    # via mpire
 pyparsing==3.1.2
     # via httplib2
 python-dateutil==2.9.0.post0
@@ -241,8 +233,6 @@ rsa==4.9
     # via google-auth
 s3transfer==0.10.2
     # via boto3
-semchunk==2.2.0
-    # via scrapegraphai
 six==1.16.0
     # via python-dateutil
 sniffio==1.3.1
@@ -266,10 +256,8 @@ tokenizers==0.19.1
 tqdm==4.66.4
     # via google-generativeai
     # via huggingface-hub
-    # via mpire
     # via openai
     # via scrapegraphai
-    # via semchunk
 typing-extensions==4.12.2
     # via anyio
     # via google-generativeai

scrapegraphai/nodes/parse_node.py

Lines changed: 7 additions & 14 deletions
@@ -2,11 +2,10 @@
 ParseNode Module
 """
 from typing import List, Optional
-from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
-from tokenizer import num_tokens_calculus
+from ..utils.split_text_into_chunks import split_text_into_chunks
 
 class ParseNode(BaseNode):
     """
@@ -69,26 +68,20 @@ def execute(self, state: dict) -> dict:
             docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
-            chunks = chunk(text=docs_transformed.page_content,
-                           chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                           token_counter=lambda text: len(text.split()),
-                           memoize=False)
+            chunks = split_text_into_chunks(text=docs_transformed.page_content,
+                                            chunk_size=self.node_config.get("chunk_size", 4096)-250)
         else:
             docs_transformed = docs_transformed[0]
 
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 
             if isinstance(docs_transformed, Document):
-                chunks = chunk(text=docs_transformed.page_content,
-                               chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
-                               memoize=False)
+                chunks = split_text_into_chunks(text=docs_transformed.page_content,
+                                                chunk_size=chunk_size)
             else:
-                chunks = chunk(text=docs_transformed,
-                               chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
-                               memoize=False)
+                chunks = split_text_into_chunks(text=docs_transformed,
+                                                chunk_size=chunk_size)
 
         state.update({self.output[0]: chunks})
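
The commit title points at tiktoken, but the tokenizer itself is not part of this diff: num_tokens_calculus already lives in scrapegraphai/utils/tokenizer.py and is simply reused by the new chunker. A minimal sketch of what a tiktoken-based counter with that signature could look like, assuming the cl100k_base encoding (both the encoding choice and the function body are assumptions, not taken from this commit):

# Hypothetical sketch only: a tiktoken-based counter shaped like num_tokens_calculus.
# The cl100k_base encoding is an assumption, not taken from this commit.
import tiktoken

def num_tokens_calculus(text: str) -> int:
    """Count the tokens tiktoken sees in `text`."""
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))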

scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,6 @@
 """
 __init__.py file for utils folder
 """
-
 from .convert_to_csv import convert_to_csv
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
@@ -17,3 +16,4 @@
                                                  crop_image)
 from .screenshot_scraping.text_detection import detect_text
 from .tokenizer import num_tokens_calculus
+from .split_text_into_chunks import split_text_into_chunks
scrapegraphai/utils/split_text_into_chunks.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+"""
+split_text_into_chunks module
+"""
+from typing import List
+from .tokenizer import num_tokens_calculus  # Import the new tokenizing function
+
+def split_text_into_chunks(text: str, chunk_size: int) -> List[str]:
+    """
+    Splits the text into chunks based on the number of tokens.
+
+    Args:
+        text (str): The text to split.
+        chunk_size (int): The maximum number of tokens per chunk.
+
+    Returns:
+        List[str]: A list of text chunks.
+    """
+    tokens = num_tokens_calculus(text)
+    if tokens <= chunk_size:
+        return [text]
+
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    words = text.split()
+    for word in words:
+        word_tokens = num_tokens_calculus(word)
+        if current_length + word_tokens > chunk_size:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = word_tokens
+        else:
+            current_chunk.append(word)
+            current_length += word_tokens
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
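
For reference, a minimal usage sketch of the new helper, imported through scrapegraphai.utils as wired up in __init__.py above (the sample text and chunk size below are made-up example values):

# Usage sketch; the input text and chunk_size are arbitrary example values.
from scrapegraphai.utils import split_text_into_chunks

text = "ScrapeGraphAI turns web pages into structured data. " * 500
chunks = split_text_into_chunks(text=text, chunk_size=256)

# Each chunk stays at or below 256 tokens as measured by num_tokens_calculus,
# unless a single word by itself exceeds the limit.
print(len(chunks))

One trade-off worth noting: the helper calls num_tokens_calculus once per word, so very long inputs pay one tokenizer call per word rather than one per chunk.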
