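Update parse_node.py: keep links in Html2Text output (ignore_links=False), move `import re` into the import group, and remove inline comments and trailing whitespace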

VinciGit00 · VinciGit00 · commit fc738cacace8 · 2024-09-08T11:54:11.000+02:00
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
@@ -3,14 +3,13 @@
 """
 from typing import Tuple, List, Optional
 from urllib.parse import urljoin
+import re
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
 from ..helpers import default_filters
 
-import re
-
 class ParseNode(BaseNode):
     """
     A node responsible for parsing HTML content from a document.
@@ -61,14 +60,12 @@ def _clean_urls(self, urls: List[str]) -> List[str]:
         """
         cleaned_urls = []
         for url in urls:
-            # Remove any leading 'thumbnail](' or similar patterns
             url = re.sub(r'.*?\]\(', '', url)
-            
-            # Remove any trailing parentheses or brackets
+
             url = url.rstrip(').')
-            
+
             cleaned_urls.append(url)
-        
+
         return cleaned_urls
 
     def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
@@ -81,26 +78,21 @@ def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
         Returns:
             Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
         """
-        # Return empty lists if the URLs are not to be parsed
         if not self.parse_urls:
             return [], []
-        
-        # Regular expression to find URLs (both links and images)
+
         image_extensions = default_filters.filter_dict["img_exts"]
         image_extension_seq = '|'.join(image_extensions).replace('.','')
         url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
 
-        # Find all URLs in the string
         all_urls = url_pattern.findall(text)
         all_urls = self._clean_urls(all_urls)
 
         if not source.startswith("http"):
-            # Remove any URLs that is not complete
             all_urls = [url for url in all_urls if url.startswith("http")]
         else:
-            # Add to local URLs the source URL
             all_urls = [urljoin(source, url) for url in all_urls]
-        
+
         images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
         links = [url for url in all_urls if url not in images]
 
@@ -136,7 +128,7 @@ def count_tokens(text):
             return token_count(text, self.llm_model.model_name)
 
         if self.parse_html:
-            docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
+            docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
             link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)