Commit 4e62689

Merge pull request #203 from mayurdb/fetchNodeFix
fix: Augment the information getting fetched from a webpage
2 parents 460d292 + 99adc97 commit 4e62689

File tree: 4 files changed, +30 −9 lines changed

CHANGELOG.md

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,3 @@
-## [0.10.0-beta.6](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.5...v0.10.0-beta.6) (2024-05-09)
-
 
 ### Bug Fixes
 
@@ -8,8 +6,10 @@
 ## [0.10.0-beta.5](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.4...v0.10.0-beta.5) (2024-05-09)
 
 
+
 ### Bug Fixes
 
+
 * fixed bugs for csv and xml ([324e977](https://github.com/VinciGit00/Scrapegraph-ai/commit/324e977b853ecaa55bac4bf86e7cd927f7f43d0d))
 
 ## [0.10.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.3...v0.10.0-beta.4) (2024-05-09)

docs/source/getting_started/examples.rst

Lines changed: 3 additions & 0 deletions

@@ -44,9 +44,12 @@ Local models
 
 Remember to have installed in your pc ollama `ollama <https://ollama.com/>`
 Remember to pull the right model for LLM and for the embeddings, like:
+
 .. code-block:: bash
 
    ollama pull llama3
+   ollama pull nomic-embed-text
+   ollama pull mistral
 
 After that, you can run the following code, using only your machine resources brum brum brum:
 
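For context, the pulled models slot into a graph configuration roughly like the sketch below. This is a minimal, hypothetical setup, not taken from this commit: the prompt and source URL are placeholders, and the exact config keys should be checked against the repository's own examples for this version.

    from scrapegraphai.graphs import SmartScraperGraph

    # Hypothetical local-model config: one Ollama model for the LLM and
    # one for the embeddings, matching the models pulled above.
    graph_config = {
        "llm": {
            "model": "ollama/mistral",
            "temperature": 0,
            "base_url": "http://localhost:11434",  # default Ollama endpoint
        },
        "embeddings": {
            "model": "ollama/nomic-embed-text",
            "base_url": "http://localhost:11434",
        },
    }

    smart_scraper_graph = SmartScraperGraph(
        prompt="List me all the projects with their descriptions",  # placeholder prompt
        source="https://example.com",                                # placeholder URL
        config=graph_config,
    )

    print(smart_scraper_graph.run())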

scrapegraphai/nodes/fetch_node.py

Lines changed: 18 additions & 3 deletions

@@ -8,7 +8,9 @@
 from langchain_core.documents import Document
 from langchain_community.document_loaders import PyPDFLoader
 from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html
+import requests
+from bs4 import BeautifulSoup
 
 
 class FetchNode(BaseNode):
@@ -34,6 +36,7 @@ class FetchNode(BaseNode):
     def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
         super().__init__(node_name, "node", input, output, 1)
 
+        self.useSoup = True if node_config is None else node_config.get("useSoup", True)
         self.headless = True if node_config is None else node_config.get(
             "headless", True)
         self.verbose = False if node_config is None else node_config.get(
@@ -94,10 +97,22 @@ def execute(self, state):
             pass
 
         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                 "source": "local_dir"
             })]
 
+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                links = soup.find_all('a')
+                link_urls = []
+                for link in links:
+                    if 'href' in link.attrs:
+                        link_urls.append(link['href'])
+                compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {source}")
         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:
 
@@ -114,7 +129,7 @@ def execute(self, state):
 
         document = loader.load()
         compressed_document = [
-            Document(page_content=remover(str(document[0].page_content)))]
+            Document(page_content=cleanup_html(str(document[0].page_content)))]
 
         state.update({self.output[0]: compressed_document})
         return state
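Taken out of the node, the new useSoup branch reduces to the following sketch (a minimal reproduction, not the committed code verbatim; https://example.com is a placeholder, and requests plus beautifulsoup4 must be installed):

    import requests
    from bs4 import BeautifulSoup

    source = "https://example.com"  # placeholder URL

    response = requests.get(source)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        # Collect the href of every anchor tag on the page
        link_urls = [a["href"] for a in soup.find_all("a") if "href" in a.attrs]
        # prettify() re-serializes the parsed HTML; the link list rides along
        # so cleanup_html can append it to the compressed document
        page_html = soup.prettify()
        print(len(page_html), link_urls)
    else:
        print(f"Failed to retrieve contents from the webpage at url: {source}")

Compared with the headless-browser path, this branch trades JavaScript rendering for a single plain HTTP request, and it is the path that preserves the page's outgoing links alongside the page content.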

scrapegraphai/utils/remover.py renamed to scrapegraphai/utils/cleanup_html.py

Lines changed: 7 additions & 4 deletions

@@ -5,7 +5,7 @@
 from minify_html import minify
 
 
-def remover(html_content: str) -> str:
+def cleanup_html(html_content: str, urls: list = []) -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
 
@@ -17,7 +17,7 @@ def remover(html_content: str) -> str:
 
     Example:
     >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
-    >>> remover(html_content)
+    >>> cleanup_html(html_content)
     'Title: Example, Body: <body><p>Hello World!</p></body>'
 
     This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
@@ -35,9 +35,12 @@ def remover(html_content: str) -> str:
 
     # Body Extraction (if it exists)
     body_content = soup.find('body')
+    urls_content = ""
+    if urls:
+        urls_content = f", URLs in page: {urls}"
 if body_content:
         # Minify the HTML within the body tag
         minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body
+        return "Title: " + title + ", Body: " + minimized_body + urls_content
 
-    return "Title: " + title + ", Body: No body content found"
+    return "Title: " + title + ", Body: No body content found" + urls_content
