 """
-Research_web module
+research_web module
 """
 import re
 from typing import List
 from langchain_community.tools import DuckDuckGoSearchResults
 from googlesearch import search as google_search
 import requests
 from bs4 import BeautifulSoup

-def search_on_web(query: str, search_engine: str = "Google",
-                  max_results: int = 10, port: int = 8080) -> List[str]:
+def search_on_web(query: str, search_engine: str = "Google",
+                  max_results: int = 10, port: int = 8080,
+                  timeout: int = 10) -> List[str]:
     """
-    Searches the web for a given query using specified search engine options.
+    Searches the web for a given query using specified search
+    engine options and filters out PDF links.

     Args:
         query (str): The search query to find on the internet.
         search_engine (str, optional): Specifies the search engine to use,
             options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
         max_results (int, optional): The maximum number of search results to return.
         port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
+        timeout (int, optional): The number of seconds to wait
+            for a response from a request. Default is 10 seconds.

     Returns:
-        List[str]: A list of URLs as strings that are the search results.
+        List[str]: A list of URLs as strings that are the search results, excluding any PDF links.

     Raises:
         ValueError: If the search engine specified is not supported.
+        requests.exceptions.Timeout: If the request times out.

     Example:
         >>> search_on_web("example query", search_engine="Google", max_results=5)
         ['http://example.com', 'http://example.org', ...]
     """

+    def filter_pdf_links(links: List[str]) -> List[str]:
+        """
+        Filters out any links that point to PDF files.
+
+        Args:
+            links (List[str]): A list of URLs as strings.
+
+        Returns:
+            List[str]: A list of URLs excluding any that end with '.pdf'.
+        """
+        return [link for link in links if not link.lower().endswith('.pdf')]
+
     if search_engine.lower() == "google":
         res = []
         for url in google_search(query, stop=max_results):
             res.append(url)
-        return res
+        return filter_pdf_links(res)

     elif search_engine.lower() == "duckduckgo":
         research = DuckDuckGoSearchResults(max_results=max_results)
         res = research.run(query)
         links = re.findall(r'https?://[^\s,\]]+', res)
-        return links
+        return filter_pdf_links(links)

     elif search_engine.lower() == "bing":
         headers = {
             "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64)
             AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
         }
         search_url = f"https://www.bing.com/search?q={query}"
-        response = requests.get(search_url, headers=headers)
+        response = requests.get(search_url, headers=headers, timeout=timeout)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, "html.parser")

         search_results = []
         for result in soup.find_all('li', class_='b_algo', limit=max_results):
             link = result.find('a')['href']
             search_results.append(link)
-        return search_results
+        return filter_pdf_links(search_results)

     elif search_engine.lower() == "searxng":
         url = f"http://localhost:{port}"
-        params = {"q": query,
-                  "format": "json",
-                  "engines": "google,duckduckgo,brave,qwant,bing"}
-
-        response = requests.get(url, params=params)
-
+        params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
+        response = requests.get(url, params=params, timeout=timeout)
         data = response.json()
-        limited_results = data["results"][:max_results]
-        return limited_results
+        limited_results = [result['url'] for result in data["results"][:max_results]]
+        return filter_pdf_links(limited_results)

     else:
-        raise ValueError("""The only search engines available are
+        raise ValueError("""The only search engines available are
                          DuckDuckGo, Google, Bing, or SearXNG""")
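
For reference, a minimal usage sketch of the updated function (not part of the commit). The import path 'research_web' is an assumption, since the diff does not show the package layout, and the Bing branch below needs network access:

# Usage sketch: exercises the new `timeout` parameter and the
# documented error cases. Import path is assumed, not confirmed.
import requests
from research_web import search_on_web  # assumed import path

try:
    links = search_on_web("web scraping tutorial",
                          search_engine="Bing",
                          max_results=5,
                          timeout=5)
    print(links)  # URLs ending in '.pdf' have already been dropped
except requests.exceptions.Timeout:
    # New in this change: requests made with `timeout` can now raise this.
    print("search request timed out")
except ValueError as exc:
    # Raised for unsupported engines, e.g. search_engine="Yahoo".
    print(exc)

One caveat worth noting: filter_pdf_links only drops URLs whose text ends in '.pdf', so a PDF served behind a query string (e.g. a link ending in '.pdf?download=1') still passes through.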