Commit 893aadd

Merge pull request #359 from VinciGit00/semchunk_integration

feat: add new chunking function

2 parents: 589da1d + e1f045b

File tree

5 files changed: +16 additions, -44 deletions

  pyproject.toml
  requirements-dev.lock
  requirements.lock
  requirements.txt
  scrapegraphai/nodes/parse_node.py

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -32,6 +32,7 @@ dependencies = [
     "playwright==1.43.0",
     "google==3.0.0",
     "undetected-playwright==0.3.0",
+    "semchunk==1.0.1",
 ]
 
 license = "MIT"
@@ -81,4 +82,4 @@ dev-dependencies = [
     "pytest-mock==3.14.0",
     "-e file:.[burr]",
     "-e file:.[docs]",
-]
+]

requirements-dev.lock

Lines changed: 5 additions & 24 deletions
@@ -30,9 +30,6 @@ anyio==4.3.0
     # via openai
     # via starlette
     # via watchfiles
-async-timeout==4.0.3
-    # via aiohttp
-    # via langchain
 attrs==23.2.0
     # via aiohttp
     # via jsonschema
@@ -51,7 +48,6 @@ botocore==1.34.113
     # via boto3
     # via s3transfer
 burr==0.19.1
-    # via burr
     # via scrapegraphai
 cachetools==5.3.3
     # via google-auth
@@ -67,13 +63,6 @@ click==8.1.7
     # via streamlit
     # via typer
     # via uvicorn
-colorama==0.4.6
-    # via click
-    # via loguru
-    # via pytest
-    # via sphinx
-    # via tqdm
-    # via uvicorn
 contourpy==1.2.1
     # via matplotlib
 cycler==0.12.1
@@ -93,9 +82,6 @@ docutils==0.19
     # via sphinx
 email-validator==2.1.1
     # via fastapi
-exceptiongroup==1.2.1
-    # via anyio
-    # via pytest
 faiss-cpu==1.8.0
     # via scrapegraphai
 fastapi==0.111.0
@@ -150,7 +136,6 @@ graphviz==0.20.3
     # via scrapegraphai
 greenlet==3.0.3
     # via playwright
-    # via sqlalchemy
 groq==0.8.0
     # via langchain-groq
 grpcio==1.64.0
@@ -388,6 +373,8 @@ rsa==4.9
     # via google-auth
 s3transfer==0.10.1
     # via boto3
+semchunk==1.0.1
+    # via scrapegraphai
 sf-hamilton==1.63.0
     # via burr
 shellingham==1.5.4
@@ -443,8 +430,6 @@ tokenizers==0.19.1
     # via anthropic
 toml==0.10.2
     # via streamlit
-tomli==2.0.1
-    # via pytest
 toolz==0.12.1
     # via altair
 tornado==6.4
@@ -454,12 +439,11 @@ tqdm==4.66.4
     # via huggingface-hub
     # via openai
     # via scrapegraphai
+    # via semchunk
 typer==0.12.3
     # via fastapi-cli
 typing-extensions==4.12.0
-    # via altair
     # via anthropic
-    # via anyio
     # via fastapi
     # via fastapi-pagination
     # via google-generativeai
@@ -474,7 +458,6 @@ typing-extensions==4.12.0
     # via streamlit
     # via typer
     # via typing-inspect
-    # via uvicorn
 typing-inspect==0.9.0
     # via dataclasses-json
     # via sf-hamilton
@@ -492,13 +475,11 @@ urllib3==1.26.18
 uvicorn==0.29.0
     # via burr
     # via fastapi
-watchdog==4.0.1
-    # via streamlit
+uvloop==0.19.0
+    # via uvicorn
 watchfiles==0.21.0
     # via uvicorn
 websockets==12.0
     # via uvicorn
-win32-setctime==1.1.0
-    # via loguru
 yarl==1.9.4
     # via aiohttp

requirements.lock

Lines changed: 3 additions & 9 deletions
@@ -22,9 +22,6 @@ anyio==4.3.0
     # via groq
     # via httpx
     # via openai
-async-timeout==4.0.3
-    # via aiohttp
-    # via langchain
 attrs==23.2.0
     # via aiohttp
 beautifulsoup4==4.12.3
@@ -43,8 +40,6 @@ certifi==2024.2.2
     # via requests
 charset-normalizer==3.3.2
     # via requests
-colorama==0.4.6
-    # via tqdm
 dataclasses-json==0.6.6
     # via langchain
     # via langchain-community
@@ -54,8 +49,6 @@ distro==1.9.0
     # via anthropic
     # via groq
     # via openai
-exceptiongroup==1.2.1
-    # via anyio
 faiss-cpu==1.8.0
     # via scrapegraphai
 filelock==3.14.0
@@ -94,7 +87,6 @@ graphviz==0.20.3
     # via scrapegraphai
 greenlet==3.0.3
     # via playwright
-    # via sqlalchemy
 groq==0.8.0
     # via langchain-groq
 grpcio==1.64.0
@@ -246,6 +238,8 @@ rsa==4.9
     # via google-auth
 s3transfer==0.10.1
     # via boto3
+semchunk==1.0.1
+    # via scrapegraphai
 six==1.16.0
     # via python-dateutil
 sniffio==1.3.1
@@ -273,9 +267,9 @@ tqdm==4.66.4
     # via huggingface-hub
     # via openai
     # via scrapegraphai
+    # via semchunk
 typing-extensions==4.12.0
     # via anthropic
-    # via anyio
     # via google-generativeai
     # via groq
     # via huggingface-hub

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -18,3 +18,4 @@ playwright==1.43.0
 langchain-aws==0.1.2
 yahoo-search-py==0.3
 undetected-playwright==0.3.0
+semchunk==1.0.1

scrapegraphai/nodes/parse_node.py

Lines changed: 5 additions & 10 deletions
@@ -3,8 +3,7 @@
 """
 
 from typing import List, Optional
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from ..utils.logging import get_logger
 from .base_node import BaseNode
@@ -67,20 +66,16 @@ def execute(self, state: dict) -> dict:
 
         # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
-
-        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-            chunk_size=self.node_config.get("chunk_size", 4096),
-            chunk_overlap=0,
-        )
-
         # Parse the document
         docs_transformed = input_data[0]
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
-        chunks = text_splitter.split_text(docs_transformed.page_content)
-
+        chunks = chunk(text=docs_transformed.page_content,
+                       chunk_size=self.node_config.get("chunk_size", 4096),
+                       token_counter=lambda x: len(x.split()),
+                       memoize=False)
         state.update({self.output[0]: chunks})
 
         return state
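
The net effect of this change is that ParseNode now delegates chunking to semchunk instead of LangChain's RecursiveCharacterTextSplitter, counting "tokens" as whitespace-separated words. A minimal standalone sketch of the same call, assuming semchunk==1.0.1 is installed; the sample text and printed summary are illustrative, not part of the commit:

    # Sketch of the chunking call this commit introduces in ParseNode.execute,
    # assuming semchunk==1.0.1; the sample text is made up for illustration.
    from semchunk import chunk

    text = "ScrapeGraphAI is a web scraping library. " * 2000

    # Same call shape as the new node code: a whitespace word count as the
    # token counter, memoization disabled, and the node's default chunk_size.
    chunks = chunk(
        text=text,
        chunk_size=4096,
        token_counter=lambda s: len(s.split()),
        memoize=False,
    )

    print(f"{len(chunks)} chunks; first has {len(chunks[0].split())} words")

Note that with a whitespace token counter, chunk_size is measured in words rather than model tokens, so a 4096-"token" chunk produced here can exceed 4096 tokens under a tiktoken-style encoder.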
