Commit 893aadd

Merge pull request #359 from VinciGit00/semchunk_integration

feat: add new chunking function

2 parents: 589da1d + e1f045b

File tree

5 files changed: +16 additions, -44 deletions

  pyproject.toml
  requirements-dev.lock
  requirements.lock
  requirements.txt
  scrapegraphai/nodes/parse_node.py

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -32,6 +32,7 @@ dependencies = [
     "playwright==1.43.0",
     "google==3.0.0",
     "undetected-playwright==0.3.0",
+    "semchunk==1.0.1",
 ]
 
 license = "MIT"
@@ -81,4 +82,4 @@ dev-dependencies = [
     "pytest-mock==3.14.0",
     "-e file:.[burr]",
     "-e file:.[docs]",
-]
+]

requirements-dev.lock

Lines changed: 5 additions & 24 deletions
@@ -30,9 +30,6 @@ anyio==4.3.0
     # via openai
     # via starlette
     # via watchfiles
-async-timeout==4.0.3
-    # via aiohttp
-    # via langchain
 attrs==23.2.0
     # via aiohttp
     # via jsonschema
@@ -51,7 +48,6 @@ botocore==1.34.113
     # via boto3
     # via s3transfer
 burr==0.19.1
-    # via burr
     # via scrapegraphai
 cachetools==5.3.3
     # via google-auth
@@ -67,13 +63,6 @@ click==8.1.7
     # via streamlit
     # via typer
     # via uvicorn
-colorama==0.4.6
-    # via click
-    # via loguru
-    # via pytest
-    # via sphinx
-    # via tqdm
-    # via uvicorn
 contourpy==1.2.1
     # via matplotlib
 cycler==0.12.1
@@ -93,9 +82,6 @@ docutils==0.19
     # via sphinx
 email-validator==2.1.1
     # via fastapi
-exceptiongroup==1.2.1
-    # via anyio
-    # via pytest
 faiss-cpu==1.8.0
     # via scrapegraphai
 fastapi==0.111.0
@@ -150,7 +136,6 @@ graphviz==0.20.3
     # via scrapegraphai
 greenlet==3.0.3
     # via playwright
-    # via sqlalchemy
 groq==0.8.0
     # via langchain-groq
 grpcio==1.64.0
@@ -388,6 +373,8 @@ rsa==4.9
     # via google-auth
 s3transfer==0.10.1
     # via boto3
+semchunk==1.0.1
+    # via scrapegraphai
 sf-hamilton==1.63.0
     # via burr
 shellingham==1.5.4
@@ -443,8 +430,6 @@ tokenizers==0.19.1
     # via anthropic
 toml==0.10.2
     # via streamlit
-tomli==2.0.1
-    # via pytest
 toolz==0.12.1
     # via altair
 tornado==6.4
@@ -454,12 +439,11 @@ tqdm==4.66.4
     # via huggingface-hub
     # via openai
     # via scrapegraphai
+    # via semchunk
 typer==0.12.3
     # via fastapi-cli
 typing-extensions==4.12.0
-    # via altair
     # via anthropic
-    # via anyio
     # via fastapi
     # via fastapi-pagination
     # via google-generativeai
@@ -474,7 +458,6 @@ typing-extensions==4.12.0
     # via streamlit
     # via typer
     # via typing-inspect
-    # via uvicorn
 typing-inspect==0.9.0
     # via dataclasses-json
     # via sf-hamilton
@@ -492,13 +475,11 @@ urllib3==1.26.18
 uvicorn==0.29.0
     # via burr
     # via fastapi
-watchdog==4.0.1
-    # via streamlit
+uvloop==0.19.0
+    # via uvicorn
 watchfiles==0.21.0
     # via uvicorn
 websockets==12.0
     # via uvicorn
-win32-setctime==1.1.0
-    # via loguru
 yarl==1.9.4
     # via aiohttp

requirements.lock

Lines changed: 3 additions & 9 deletions
@@ -22,9 +22,6 @@ anyio==4.3.0
     # via groq
     # via httpx
     # via openai
-async-timeout==4.0.3
-    # via aiohttp
-    # via langchain
 attrs==23.2.0
     # via aiohttp
 beautifulsoup4==4.12.3
@@ -43,8 +40,6 @@ certifi==2024.2.2
     # via requests
 charset-normalizer==3.3.2
     # via requests
-colorama==0.4.6
-    # via tqdm
 dataclasses-json==0.6.6
     # via langchain
     # via langchain-community
@@ -54,8 +49,6 @@ distro==1.9.0
     # via anthropic
     # via groq
     # via openai
-exceptiongroup==1.2.1
-    # via anyio
 faiss-cpu==1.8.0
     # via scrapegraphai
 filelock==3.14.0
@@ -94,7 +87,6 @@ graphviz==0.20.3
     # via scrapegraphai
 greenlet==3.0.3
     # via playwright
-    # via sqlalchemy
 groq==0.8.0
     # via langchain-groq
 grpcio==1.64.0
@@ -246,6 +238,8 @@ rsa==4.9
     # via google-auth
 s3transfer==0.10.1
     # via boto3
+semchunk==1.0.1
+    # via scrapegraphai
 six==1.16.0
     # via python-dateutil
 sniffio==1.3.1
@@ -273,9 +267,9 @@ tqdm==4.66.4
     # via huggingface-hub
     # via openai
     # via scrapegraphai
+    # via semchunk
 typing-extensions==4.12.0
     # via anthropic
-    # via anyio
     # via google-generativeai
     # via groq
     # via huggingface-hub

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -18,3 +18,4 @@ playwright==1.43.0
 langchain-aws==0.1.2
 yahoo-search-py==0.3
 undetected-playwright==0.3.0
+semchunk==1.0.1

scrapegraphai/nodes/parse_node.py

Lines changed: 5 additions & 10 deletions
@@ -3,8 +3,7 @@
 """
 
 from typing import List, Optional
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from ..utils.logging import get_logger
 from .base_node import BaseNode
@@ -67,20 +66,16 @@ def execute(self, state: dict) -> dict:
 
         # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
-
-        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-            chunk_size=self.node_config.get("chunk_size", 4096),
-            chunk_overlap=0,
-        )
-
         # Parse the document
         docs_transformed = input_data[0]
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
-        chunks = text_splitter.split_text(docs_transformed.page_content)
-
+        chunks = chunk(text=docs_transformed.page_content,
+                       chunk_size=self.node_config.get("chunk_size", 4096),
+                       token_counter=lambda x: len(x.split()),
+                       memoize=False)
         state.update({self.output[0]: chunks})
 
         return state
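
The net effect of this change is that ParseNode now delegates chunking to semchunk instead of LangChain's RecursiveCharacterTextSplitter, counting "tokens" as whitespace-separated words. A minimal standalone sketch of the same call, assuming semchunk==1.0.1 is installed; the sample text and printed summary are illustrative, not part of the commit:

    # Sketch of the chunking call this commit introduces in ParseNode.execute,
    # assuming semchunk==1.0.1; the sample text is made up for illustration.
    from semchunk import chunk

    text = "ScrapeGraphAI is a web scraping library. " * 2000

    # Same call shape as the new node code: a whitespace word count as the
    # token counter, memoization disabled, and the node's default chunk_size.
    chunks = chunk(
        text=text,
        chunk_size=4096,
        token_counter=lambda s: len(s.split()),
        memoize=False,
    )

    print(f"{len(chunks)} chunks; first has {len(chunks[0].split())} words")

Note that with a whitespace token counter, chunk_size is measured in words rather than model tokens, so a 4096-"token" chunk produced here can exceed 4096 tokens under a tiktoken-style encoder.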
