
Commit b752499

Merge pull request #217 from mayurdb/fetchLinkFix
Fetch links in the page while parsing html
2 parents 1fa77e5 + 300fd5d commit b752499

File tree

3 files changed: +29 -8 lines changed


scrapegraphai/nodes/fetch_node.py

Lines changed: 14 additions & 3 deletions
@@ -3,12 +3,13 @@
 """
 import pandas as pd
 import json
+import requests
 from typing import List, Optional
 from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.documents import Document
 from langchain_community.document_loaders import PyPDFLoader
 from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html


 class FetchNode(BaseNode):
@@ -38,6 +39,8 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict] =
             "headless", True)
         self.verbose = False if node_config is None else node_config.get(
             "verbose", False)
+        self.useSoup = True if node_config is None else node_config.get(
+            "useSoup", True)

     def execute(self, state):
         """
@@ -94,9 +97,17 @@ def execute(self, state):
                 pass

         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                 "source": "local_dir"
             })]
+
+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                cleanedup_html = cleanup_html(response.text, source)
+                compressed_document = [Document(page_content=cleanedup_html)]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {url}")

         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:
@@ -114,7 +125,7 @@ def execute(self, state):

             document = loader.load()
             compressed_document = [
-                Document(page_content=remover(str(document[0].page_content)))]
+                Document(page_content=cleanup_html(str(document[0].page_content)))]

         state.update({self.output[0]: compressed_document})
         return state
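
In practice, the new useSoup branch is just a plain requests fetch followed by cleanup_html. A minimal standalone sketch of that path (with a made-up source URL, and logging the failure with source rather than the url name used in the diff above):

    import requests
    from langchain_core.documents import Document
    from scrapegraphai.utils.cleanup_html import cleanup_html

    source = "https://example.com"  # hypothetical URL standing in for the node's input

    response = requests.get(source)
    if response.status_code == 200:
        # cleanup_html now takes the base URL so relative links can be resolved
        cleaned_html = cleanup_html(response.text, source)
        compressed_document = [Document(page_content=cleaned_html)]
    else:
        print(f"Failed to retrieve contents from the webpage at url: {source}")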

scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,4 +6,4 @@
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
 from .proxy_rotation import proxy_generator
-from .remover import remover
+from .cleanup_html import cleanup_html
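
With the rename, callers that previously imported the helper as remover now import cleanup_html instead; a hypothetical call site would change along these lines:

    # before this commit
    from scrapegraphai.utils import remover
    cleaned = remover(html_string)  # html_string: some HTML text you already have

    # after this commit (a base URL is now required so links can be resolved)
    from scrapegraphai.utils import cleanup_html
    cleaned = cleanup_html(html_string, "https://example.com")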

scrapegraphai/utils/remover.py renamed to scrapegraphai/utils/cleanup_html.py

Lines changed: 14 additions & 4 deletions
@@ -3,9 +3,9 @@
 """
 from bs4 import BeautifulSoup
 from minify_html import minify
+from urllib.parse import urljoin

-
-def remover(html_content: str) -> str:
+def cleanup_html(html_content: str, base_url: str) -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.

@@ -33,11 +33,21 @@ def remover(html_content: str) -> str:
     for tag in soup.find_all(['script', 'style']):
         tag.extract()

+    # Links extraction
+    links = soup.find_all('a')
+    link_urls = []
+    for link in links:
+        if 'href' in link.attrs:
+            link_urls.append(urljoin(base_url, link['href']))
+
     # Body Extraction (if it exists)
     body_content = soup.find('body')
     if body_content:
         # Minify the HTML within the body tag
         minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body
+        print("Came here")
+        return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls)
+

-    return "Title: " + title + ", Body: No body content found"
+    print("No Came here")
+    return "Title: " + title + ", Body: No body content found" + ", Links: " + str(link_urls)

0 commit comments
