Skip to content

Commit a84176f

Browse files
authored
Merge pull request #715 from ScrapeGraphAI/713-pdf-scrapping
713 pdf scraping
2 parents ac31d7f + 99ad654 commit a84176f

File tree

2 files changed

+35
-19
lines changed

2 files changed

+35
-19
lines changed

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
1+
12
## [1.26.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.26.0-beta.1) (2024-09-29)
23

34

4-
### Features
55

66
* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
77
* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24))
88

99

10+
1011
### Bug Fixes
1112

13+
* removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
14+
1215
* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
1316
* removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
1417

scrapegraphai/utils/research_web.py

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Research_web module
2+
research_web module
33
"""
44
import re
55
from typing import List
@@ -8,69 +8,82 @@
88
import requests
99
from bs4 import BeautifulSoup
1010

11-
def search_on_web(query: str, search_engine: str = "Google",
                  max_results: int = 10, port: int = 8080,
                  timeout: int = 10) -> List[str]:
    """
    Searches the web for a given query using the specified search
    engine and filters out PDF links from the results.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use,
            options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'.
            Case-insensitive. Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.
        port (int, optional): The port number to use when searching with 'SearXNG'.
            Default is 8080.
        timeout (int, optional): The number of seconds to wait for a response
            from a request. Default is 10 seconds. Note: only the Bing and
            SearXNG branches issue HTTP requests directly, so only they honor it.

    Returns:
        List[str]: A list of URLs as strings that are the search results,
        excluding any PDF links.

    Raises:
        ValueError: If the search engine specified is not supported.
        requests.exceptions.Timeout: If the request times out.
        requests.exceptions.HTTPError: If Bing or SearXNG respond with an
            HTTP error status.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """

    def filter_pdf_links(links: List[str]) -> List[str]:
        """
        Filters out any links that point to PDF files.

        Args:
            links (List[str]): A list of URLs as strings.

        Returns:
            List[str]: A list of URLs excluding any that end with '.pdf'.
        """
        return [link for link in links if not link.lower().endswith('.pdf')]

    # Normalize once instead of re-lowercasing in every branch.
    engine = search_engine.lower()

    if engine == "google":
        # google_search is a generator; materialize up to max_results URLs.
        res = list(google_search(query, stop=max_results))
        return filter_pdf_links(res)

    elif engine == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        # The DuckDuckGo wrapper returns a single string; pull URLs out of it.
        links = re.findall(r'https?://[^\s,\]]+', res)
        return filter_pdf_links(links)

    elif engine == "bing":
        headers = {
            "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64)
            AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
        }
        search_url = f"https://www.bing.com/search?q={query}"
        response = requests.get(search_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            link = result.find('a')['href']
            search_results.append(link)
        return filter_pdf_links(search_results)

    elif engine == "searxng":
        url = f"http://localhost:{port}"
        params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
        response = requests.get(url, params=params, timeout=timeout)
        # Fail fast on HTTP errors (consistent with the Bing branch) instead of
        # surfacing an opaque JSON decode error on a non-2xx response.
        response.raise_for_status()
        data = response.json()
        limited_results = [result['url'] for result in data["results"][:max_results]]
        return filter_pdf_links(limited_results)

    else:
        raise ValueError("""The only search engines available are
                         DuckDuckGo, Google, Bing, or SearXNG""")

0 commit comments

Comments
 (0)