Skip to content

Commit fc738ca

Browse files
committed
Update parse_node.py
1 parent a73fec5 commit fc738ca

File tree

1 file changed

+7
-15
lines changed

1 file changed

+7
-15
lines changed

scrapegraphai/nodes/parse_node.py

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,13 @@
33
"""
44
from typing import Tuple, List, Optional
55
from urllib.parse import urljoin
6+
import re
67
from semchunk import chunk
78
from langchain_community.document_transformers import Html2TextTransformer
89
from langchain_core.documents import Document
910
from .base_node import BaseNode
1011
from ..helpers import default_filters
1112

12-
import re
13-
1413
class ParseNode(BaseNode):
1514
"""
1615
A node responsible for parsing HTML content from a document.
@@ -61,14 +60,12 @@ def _clean_urls(self, urls: List[str]) -> List[str]:
6160
"""
6261
cleaned_urls = []
6362
for url in urls:
64-
# Remove any leading 'thumbnail](' or similar patterns
6563
url = re.sub(r'.*?\]\(', '', url)
66-
67-
# Remove any trailing parentheses or brackets
64+
6865
url = url.rstrip(').')
69-
66+
7067
cleaned_urls.append(url)
71-
68+
7269
return cleaned_urls
7370

7471
def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
@@ -81,26 +78,21 @@ def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
8178
Returns:
8279
Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
8380
"""
84-
# Return empty lists if the URLs are not to be parsed
8581
if not self.parse_urls:
8682
return [], []
87-
88-
# Regular expression to find URLs (both links and images)
83+
8984
image_extensions = default_filters.filter_dict["img_exts"]
9085
image_extension_seq = '|'.join(image_extensions).replace('.','')
9186
url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
9287

93-
# Find all URLs in the string
9488
all_urls = url_pattern.findall(text)
9589
all_urls = self._clean_urls(all_urls)
9690

9791
if not source.startswith("http"):
98-
# Remove any URLs that are not complete
9992
all_urls = [url for url in all_urls if url.startswith("http")]
10093
else:
101-
# Resolve relative URLs against the source URL
10294
all_urls = [urljoin(source, url) for url in all_urls]
103-
95+
10496
images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
10597
links = [url for url in all_urls if url not in images]
10698

@@ -136,7 +128,7 @@ def count_tokens(text):
136128
return token_count(text, self.llm_model.model_name)
137129

138130
if self.parse_html:
139-
docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
131+
docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
140132
docs_transformed = docs_transformed[0]
141133

142134
link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)

0 commit comments

Comments
 (0)