ScrapeGraphAI · VinciGit00 · Jun 4, 2024 · Jun 4, 2024
diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
@@ -35,11 +35,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
         tag.extract()
 
     # Links extraction
-    links = soup.find_all('a')
-    link_urls = []
-    for link in links:
-        if 'href' in link.attrs:
-            link_urls.append(urljoin(base_url, link['href']))
+    link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]
 
     # Images extraction
     images = soup.find_all('img')
@@ -62,4 +58,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
         # return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)
 
     # throw an error if no body content is found
-    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
+    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")