Commit f2bb22d

fix: temporary fix for parse_node
1 parent fc738ca commit f2bb22d

2 files changed: 6 additions & 76 deletions


scrapegraphai/nodes/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -22,4 +22,4 @@
 from .merge_generated_scripts import MergeGeneratedScriptsNode
 from .fetch_screen_node import FetchScreenNode
 from .generate_answer_from_image_node import GenerateAnswerFromImageNode
-from .concat_answers_node import ConcatAnswersNode
+from .concat_answers_node import ConcatAnswersNode

scrapegraphai/nodes/parse_node.py

Lines changed: 5 additions & 75 deletions
@@ -1,14 +1,11 @@
 """
 ParseNode Module
 """
-from typing import Tuple, List, Optional
-from urllib.parse import urljoin
-import re
+from typing import List, Optional
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
-from ..helpers import default_filters

 class ParseNode(BaseNode):
     """
@@ -43,60 +40,6 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
-        self.llm_model = node_config['llm_model']
-        self.parse_urls = (
-            False if node_config is None else node_config.get("parse_urls", False)
-        )
-
-    def _clean_urls(self, urls: List[str]) -> List[str]:
-        """
-        Cleans the URLs extracted from the text.
-
-        Args:
-            urls (List[str]): The list of URLs to clean.
-
-        Returns:
-            List[str]: The cleaned URLs.
-        """
-        cleaned_urls = []
-        for url in urls:
-            url = re.sub(r'.*?\]\(', '', url)
-
-            url = url.rstrip(').')
-
-            cleaned_urls.append(url)
-
-        return cleaned_urls
-
-    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
-        """
-        Extracts URLs from the given text.
-
-        Args:
-            text (str): The text to extract URLs from.
-
-        Returns:
-            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
-        """
-        if not self.parse_urls:
-            return [], []
-
-        image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.','')
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-        all_urls = url_pattern.findall(text)
-        all_urls = self._clean_urls(all_urls)
-
-        if not source.startswith("http"):
-            all_urls = [url for url in all_urls if url.startswith("http")]
-        else:
-            all_urls = [urljoin(source, url) for url in all_urls]
-
-        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
-        links = [url for url in all_urls if url not in images]
-
-        return links, images

     def execute(self, state: dict) -> dict:
         """
@@ -119,46 +62,33 @@ def execute(self, state: dict) -> dict:
         input_keys = self.get_input_keys(state)

         input_data = [state[key] for key in input_keys]
-
         docs_transformed = input_data[0]
-        source = input_data[1] if self.parse_urls else None
-
-        def count_tokens(text):
-            from ..utils import token_count
-            return token_count(text, self.llm_model.model_name)

         if self.parse_html:
-            docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
+            docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]

-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                           token_counter=count_tokens,
+                           token_counter=lambda text: len(text.split()),
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]

-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

             if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)
             else:
                 chunks = chunk(text=docs_transformed,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)

         state.update({self.output[0]: chunks})
-        if self.parse_urls:
-            state.update({self.output[1]: link_urls})
-            state.update({self.output[2]: img_urls})

         return state
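The substantive change in parse_node.py is the removal of URL extraction and the replacement of the model-aware count_tokens helper (which imported token_count from ..utils and relied on self.llm_model) with a plain whitespace word count passed to semchunk. Below is a minimal sketch of the chunking call as it behaves after this commit, assuming only that semchunk is installed; the sample text is illustrative, and the chunk size mirrors the 4096 - 250 default seen in the diff.

from semchunk import chunk

sample_text = "Some transformed page content that should be split into smaller chunks."

# After this commit the token counter is a simple whitespace word count,
# so chunk_size is effectively measured in words rather than model tokens.
chunks = chunk(text=sample_text,
               chunk_size=4096 - 250,
               token_counter=lambda text: len(text.split()),
               memoize=False)

print(chunks)

A whitespace count is only a rough proxy for model tokens, which is presumably why the commit message describes this as a temporary fix.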
