
feat: add new chunking function #359

Merged 1 commit on Jun 11, 2024

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
"playwright==1.43.0",
"google==3.0.0",
"undetected-playwright==0.3.0",
"semchunk==1.0.1",
]

license = "MIT"
@@ -80,4 +81,4 @@ dev-dependencies = [
"pytest-mock==3.14.0",
"-e file:.[burr]",
"-e file:.[docs]",
]
]
29 changes: 5 additions & 24 deletions requirements-dev.lock
@@ -30,9 +30,6 @@ anyio==4.3.0
# via openai
# via starlette
# via watchfiles
async-timeout==4.0.3
# via aiohttp
# via langchain
attrs==23.2.0
# via aiohttp
# via jsonschema
@@ -51,7 +48,6 @@ botocore==1.34.113
# via boto3
# via s3transfer
burr==0.19.1
# via burr
# via scrapegraphai
cachetools==5.3.3
# via google-auth
@@ -67,13 +63,6 @@ click==8.1.7
# via streamlit
# via typer
# via uvicorn
colorama==0.4.6
# via click
# via loguru
# via pytest
# via sphinx
# via tqdm
# via uvicorn
contourpy==1.2.1
# via matplotlib
cycler==0.12.1
@@ -93,9 +82,6 @@ docutils==0.19
# via sphinx
email-validator==2.1.1
# via fastapi
exceptiongroup==1.2.1
# via anyio
# via pytest
faiss-cpu==1.8.0
# via scrapegraphai
fastapi==0.111.0
@@ -150,7 +136,6 @@ graphviz==0.20.3
# via scrapegraphai
greenlet==3.0.3
# via playwright
# via sqlalchemy
groq==0.8.0
# via langchain-groq
grpcio==1.64.0
@@ -388,6 +373,8 @@ rsa==4.9
# via google-auth
s3transfer==0.10.1
# via boto3
semchunk==1.0.1
# via scrapegraphai
sf-hamilton==1.63.0
# via burr
shellingham==1.5.4
@@ -443,8 +430,6 @@ tokenizers==0.19.1
# via anthropic
toml==0.10.2
# via streamlit
tomli==2.0.1
# via pytest
toolz==0.12.1
# via altair
tornado==6.4
@@ -454,12 +439,11 @@ tqdm==4.66.4
# via huggingface-hub
# via openai
# via scrapegraphai
# via semchunk
typer==0.12.3
# via fastapi-cli
typing-extensions==4.12.0
# via altair
# via anthropic
# via anyio
# via fastapi
# via fastapi-pagination
# via google-generativeai
@@ -474,7 +458,6 @@ typing-extensions==4.12.0
# via streamlit
# via typer
# via typing-inspect
# via uvicorn
typing-inspect==0.9.0
# via dataclasses-json
# via sf-hamilton
@@ -492,13 +475,11 @@ urllib3==1.26.18
uvicorn==0.29.0
# via burr
# via fastapi
watchdog==4.0.1
# via streamlit
uvloop==0.19.0
# via uvicorn
watchfiles==0.21.0
# via uvicorn
websockets==12.0
# via uvicorn
win32-setctime==1.1.0
# via loguru
yarl==1.9.4
# via aiohttp
12 changes: 3 additions & 9 deletions requirements.lock
@@ -22,9 +22,6 @@ anyio==4.3.0
# via groq
# via httpx
# via openai
async-timeout==4.0.3
# via aiohttp
# via langchain
attrs==23.2.0
# via aiohttp
beautifulsoup4==4.12.3
@@ -43,8 +40,6 @@ certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
colorama==0.4.6
# via tqdm
dataclasses-json==0.6.6
# via langchain
# via langchain-community
@@ -54,8 +49,6 @@ distro==1.9.0
# via anthropic
# via groq
# via openai
exceptiongroup==1.2.1
# via anyio
faiss-cpu==1.8.0
# via scrapegraphai
filelock==3.14.0
@@ -94,7 +87,6 @@ graphviz==0.20.3
# via scrapegraphai
greenlet==3.0.3
# via playwright
# via sqlalchemy
groq==0.8.0
# via langchain-groq
grpcio==1.64.0
@@ -246,6 +238,8 @@ rsa==4.9
# via google-auth
s3transfer==0.10.1
# via boto3
semchunk==1.0.1
# via scrapegraphai
six==1.16.0
# via python-dateutil
sniffio==1.3.1
@@ -273,9 +267,9 @@ tqdm==4.66.4
# via huggingface-hub
# via openai
# via scrapegraphai
# via semchunk
typing-extensions==4.12.0
# via anthropic
# via anyio
# via google-generativeai
# via groq
# via huggingface-hub
1 change: 1 addition & 0 deletions requirements.txt
@@ -18,3 +18,4 @@ playwright==1.43.0
langchain-aws==0.1.2
yahoo-search-py==0.3
undetected-playwright==0.3.0
semchunk==1.0.1
15 changes: 5 additions & 10 deletions scrapegraphai/nodes/parse_node.py
@@ -3,8 +3,7 @@
"""

from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from semchunk import chunk
from langchain_community.document_transformers import Html2TextTransformer
from ..utils.logging import get_logger
from .base_node import BaseNode
@@ -67,20 +66,16 @@ def execute(self, state: dict) -> dict:

# Fetching data from the state based on the input keys
input_data = [state[key] for key in input_keys]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=self.node_config.get("chunk_size", 4096),
    chunk_overlap=0,
)

# Parse the document
docs_transformed = input_data[0]
if self.parse_html:
    docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
    docs_transformed = docs_transformed[0]

chunks = text_splitter.split_text(docs_transformed.page_content)

chunks = chunk(text=docs_transformed.page_content,
               chunk_size=self.node_config.get("chunk_size", 4096),
               token_counter=lambda x: len(x.split()),
               memoize=False)
state.update({self.output[0]: chunks})

return state
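
For reference, the call added above is semchunk's top-level chunk helper. Below is a minimal standalone sketch of that usage, assuming semchunk==1.0.1 as pinned in this PR; the sample text and the tiny chunk_size are illustrative only, and the whitespace-split token_counter mirrors the lambda in parse_node.py (it counts words, not model tokens).

from semchunk import chunk

# Illustrative input; in ParseNode this is docs_transformed.page_content.
sample_text = (
    "ScrapeGraphAI turns fetched pages into plain text and then splits that "
    "text into chunks small enough for downstream LLM nodes to consume."
)

chunks = chunk(
    text=sample_text,
    chunk_size=8,                            # max "tokens" per chunk (words, given the counter below)
    token_counter=lambda x: len(x.split()),  # same word-count approximation as in ParseNode
    memoize=False,                           # disable token-count memoization, as in the PR diff
)

for i, piece in enumerate(chunks):
    print(i, repr(piece))

Compared with the RecursiveCharacterTextSplitter.from_tiktoken_encoder call it replaces, this keeps the same chunk_size default (4096) but swaps exact tiktoken counts for a cheaper word-based estimate, so chunk boundaries are only approximate with respect to real model tokens.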