5 files changed, +16 -44 lines changed

File 1 of 5
@@ -32,6 +32,7 @@ dependencies = [
     "playwright==1.43.0",
     "google==3.0.0",
     "undetected-playwright==0.3.0",
+    "semchunk==1.0.1",
 ]

 license = "MIT"
@@ -81,4 +82,4 @@ dev-dependencies = [
     "pytest-mock==3.14.0",
     "-e file:.[burr]",
     "-e file:.[docs]",
-]
+]
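The manifest hunk above only adds semchunk 1.0.1 as a regular runtime dependency; the dev-dependency group is untouched apart from the trailing bracket line. A minimal sketch for confirming the new pin is actually resolved after re-syncing the environment (the printed messages are illustrative, not part of the project):

# Quick sanity check that the semchunk pin from the updated manifest is installed.
# importlib.metadata ships with the standard library (Python 3.8+).
from importlib.metadata import PackageNotFoundError, version

try:
    print("semchunk", version("semchunk"))  # expected: 1.0.1 with this lockfile
except PackageNotFoundError:
    print("semchunk is not installed; re-sync the project's dependencies")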
File 2 of 5
@@ -30,9 +30,6 @@ anyio==4.3.0
   # via openai
   # via starlette
   # via watchfiles
-async-timeout==4.0.3
-  # via aiohttp
-  # via langchain
 attrs==23.2.0
   # via aiohttp
   # via jsonschema
@@ -51,7 +48,6 @@ botocore==1.34.113
   # via boto3
   # via s3transfer
 burr==0.19.1
-  # via burr
   # via scrapegraphai
 cachetools==5.3.3
   # via google-auth
@@ -67,13 +63,6 @@ click==8.1.7
   # via streamlit
   # via typer
   # via uvicorn
-colorama==0.4.6
-  # via click
-  # via loguru
-  # via pytest
-  # via sphinx
-  # via tqdm
-  # via uvicorn
 contourpy==1.2.1
   # via matplotlib
 cycler==0.12.1
@@ -93,9 +82,6 @@ docutils==0.19
   # via sphinx
 email-validator==2.1.1
   # via fastapi
-exceptiongroup==1.2.1
-  # via anyio
-  # via pytest
 faiss-cpu==1.8.0
   # via scrapegraphai
 fastapi==0.111.0
@@ -150,7 +136,6 @@ graphviz==0.20.3
   # via scrapegraphai
 greenlet==3.0.3
   # via playwright
-  # via sqlalchemy
 groq==0.8.0
   # via langchain-groq
 grpcio==1.64.0
@@ -388,6 +373,8 @@ rsa==4.9
   # via google-auth
 s3transfer==0.10.1
   # via boto3
+semchunk==1.0.1
+  # via scrapegraphai
 sf-hamilton==1.63.0
   # via burr
 shellingham==1.5.4
@@ -443,8 +430,6 @@ tokenizers==0.19.1
   # via anthropic
 toml==0.10.2
   # via streamlit
-tomli==2.0.1
-  # via pytest
 toolz==0.12.1
   # via altair
 tornado==6.4
@@ -454,12 +439,11 @@ tqdm==4.66.4
   # via huggingface-hub
   # via openai
   # via scrapegraphai
+  # via semchunk
 typer==0.12.3
   # via fastapi-cli
 typing-extensions==4.12.0
-  # via altair
   # via anthropic
-  # via anyio
   # via fastapi
   # via fastapi-pagination
   # via google-generativeai
@@ -474,7 +458,6 @@ typing-extensions==4.12.0
   # via streamlit
   # via typer
   # via typing-inspect
-  # via uvicorn
 typing-inspect==0.9.0
   # via dataclasses-json
   # via sf-hamilton
@@ -492,13 +475,11 @@ urllib3==1.26.18
 uvicorn==0.29.0
   # via burr
   # via fastapi
-watchdog==4.0.1
-  # via streamlit
+uvloop==0.19.0
+  # via uvicorn
 watchfiles==0.21.0
   # via uvicorn
 websockets==12.0
   # via uvicorn
-win32-setctime==1.1.0
-  # via loguru
 yarl==1.9.4
   # via aiohttp
File 3 of 5
@@ -22,9 +22,6 @@ anyio==4.3.0
   # via groq
   # via httpx
   # via openai
-async-timeout==4.0.3
-  # via aiohttp
-  # via langchain
 attrs==23.2.0
   # via aiohttp
 beautifulsoup4==4.12.3
@@ -43,8 +40,6 @@ certifi==2024.2.2
   # via requests
 charset-normalizer==3.3.2
   # via requests
-colorama==0.4.6
-  # via tqdm
 dataclasses-json==0.6.6
   # via langchain
   # via langchain-community
@@ -54,8 +49,6 @@ distro==1.9.0
   # via anthropic
   # via groq
   # via openai
-exceptiongroup==1.2.1
-  # via anyio
 faiss-cpu==1.8.0
   # via scrapegraphai
 filelock==3.14.0
@@ -94,7 +87,6 @@ graphviz==0.20.3
   # via scrapegraphai
 greenlet==3.0.3
   # via playwright
-  # via sqlalchemy
 groq==0.8.0
   # via langchain-groq
 grpcio==1.64.0
@@ -246,6 +238,8 @@ rsa==4.9
   # via google-auth
 s3transfer==0.10.1
   # via boto3
+semchunk==1.0.1
+  # via scrapegraphai
 six==1.16.0
   # via python-dateutil
 sniffio==1.3.1
@@ -273,9 +267,9 @@ tqdm==4.66.4
   # via huggingface-hub
   # via openai
   # via scrapegraphai
+  # via semchunk
 typing-extensions==4.12.0
   # via anthropic
-  # via anyio
   # via google-generativeai
   # via groq
   # via huggingface-hub
File 4 of 5
@@ -18,3 +18,4 @@ playwright==1.43.0
 langchain-aws == 0.1.2
 yahoo-search-py == 0.3
 undetected-playwright == 0.3.0
+semchunk == 1.0.1
File 5 of 5
@@ -3,8 +3,7 @@
 """

 from typing import List, Optional
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from ..utils.logging import get_logger
 from .base_node import BaseNode
@@ -67,20 +66,16 @@ def execute(self, state: dict) -> dict:

         # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
-
-        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-            chunk_size=self.node_config.get("chunk_size", 4096),
-            chunk_overlap=0,
-        )
-
         # Parse the document
         docs_transformed = input_data[0]
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]

-        chunks = text_splitter.split_text(docs_transformed.page_content)
-
+        chunks = chunk(text=docs_transformed.page_content,
+                       chunk_size=self.node_config.get("chunk_size", 4096),
+                       token_counter=lambda x: len(x.split()),
+                       memoize=False)
         state.update({self.output[0]: chunks})

         return state
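The node change above replaces LangChain's RecursiveCharacterTextSplitter (tiktoken-based, zero overlap) with semchunk's chunk function, using a plain whitespace word count as the token counter and memoization turned off. A standalone sketch of the same call outside the node, with a made-up sample text and a small chunk size chosen only for illustration:

from semchunk import chunk

# Mirror the node's call: split the text into pieces of at most `chunk_size`
# "tokens", where a token is simply a whitespace-separated word here.
sample_text = "ScrapeGraphAI fetches a page, converts it to text, and splits it into chunks for the LLM. " * 40
chunks = chunk(
    text=sample_text,
    chunk_size=64,                           # counterpart of node_config["chunk_size"] (default 4096 in the node)
    token_counter=lambda x: len(x.split()),  # crude counter; a model-specific tokenizer could be swapped in
    memoize=False,
)
print(f"{len(chunks)} chunks, largest has {max(len(c.split()) for c in chunks)} words")

One caveat on the swap: a whitespace word count is only a rough proxy for model tokens, so a chunk_size of 4096 words will generally not line up with the 4096 tiktoken tokens the old splitter targeted.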