Skip to content

Commit 58cd523

Browse files
committed
Merge branch 'pre/beta' of https://github.com/VinciGit00/Scrapegraph-ai into pre/beta
2 parents f81442b + 4c0d0e9 commit 58cd523

File tree

1 file changed

+2
-6
lines changed

1 file changed

+2
-6
lines changed

scrapegraphai/utils/cleanup_html.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -35,11 +35,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
35 | 35 |   tag.extract()
36 | 36 |
37 | 37 |   # Links extraction
38 |    | - links = soup.find_all('a')
39 |    | - link_urls = []
40 |    | - for link in links:
41 |    | - if 'href' in link.attrs:
42 |    | - link_urls.append(urljoin(base_url, link['href']))
   | 38 | + link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]
43 | 39 |
44 | 40 |   # Images extraction
45 | 41 |   images = soup.find_all('img')
@@ -62,4 +58,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
62 | 58 |   # return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)
63 | 59 |
64 | 60 |   # throw an error if no body content is found
65 |    | - raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
   | 61 | + raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")

0 commit comments

Comments
 (0)