 from bs4 import BeautifulSoup

 def search_on_web(query: str, search_engine: str = "Google",
-                  max_results: int = 10, port: int = 8080,
+                  max_results: int = 10, port: int = 8080,
                   timeout: int = 10, proxy: str | dict = None) -> List[str]:
+    """Search the web for a query with input validation and unified error handling."""
+
+    # Input validation
+    if not query or not isinstance(query, str):
+        raise ValueError("Query must be a non-empty string")
+
+    search_engine = search_engine.lower()
+    valid_engines = {"google", "duckduckgo", "bing", "searxng"}
+    if search_engine not in valid_engines:
+        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
+
+    # Format the proxy once so every engine branch can reuse it
+    formatted_proxy = None
+    if proxy:
+        formatted_proxy = format_proxy(proxy)
+
+    try:
+        results = []
+        if search_engine == "google":
+            results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))
+
+        elif search_engine == "duckduckgo":
+            research = DuckDuckGoSearchResults(max_results=max_results)
+            res = research.run(query)
+            results = re.findall(r'https?://[^\s,\]]+', res)
+
+        elif search_engine == "bing":
+            results = _search_bing(query, max_results, timeout, formatted_proxy)
+
+        elif search_engine == "searxng":
+            results = _search_searxng(query, max_results, port, timeout)
+
+        return filter_pdf_links(results)
+
+    except requests.Timeout:
+        raise TimeoutError(f"Search request timed out after {timeout} seconds")
+    except requests.RequestException as e:
+        raise RuntimeError(f"Search request failed: {str(e)}")
+
+def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
+    """Helper function for Bing search"""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    search_url = "https://www.bing.com/search"
+
+    proxies = {"http": proxy, "https": proxy} if proxy else None
+    # Pass the query via params so requests URL-encodes it instead of interpolating it raw
+    response = requests.get(search_url, headers=headers, params={"q": query}, timeout=timeout, proxies=proxies)
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.text, "html.parser")
+    return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)]
+
+def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
+    """Helper function for SearXNG search"""
+    url = f"http://localhost:{port}"
+    params = {
+        "q": query,
+        "format": "json",
+        "engines": "google,duckduckgo,brave,qwant,bing"
+    }
+    response = requests.get(url, params=params, timeout=timeout)
+    response.raise_for_status()
+    return [result['url'] for result in response.json().get("results", [])[:max_results]]
+
+def format_proxy(proxy):
+    if isinstance(proxy, dict):
+        server = proxy.get('server')
+        username = proxy.get('username')
+        password = proxy.get('password')
+
+        if all([username, password, server]):
+            proxy_url = f"http://{username}:{password}@{server}"
+            return proxy_url
+        else:
+            raise ValueError("Proxy dictionary is missing required fields.")
+    elif isinstance(proxy, str):
+        return proxy  # "https://username:password@ip:port"
+    else:
+        raise TypeError("Proxy should be a dictionary or a string.")
+
+def filter_pdf_links(links: List[str]) -> List[str]:
     """
-    Searches the web for a given query using specified search
-    engine options and filters out PDF links.
+    Filters out any links that point to PDF files.

     Args:
-        query (str): The search query to find on the internet.
-        search_engine (str, optional): Specifies the search engine to use,
-        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
-        max_results (int, optional): The maximum number of search results to return.
-        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
-        timeout (int, optional): The number of seconds to wait
-        for a response from a request. Default is 10 seconds.
-        proxy (dict or string, optional): The proxy server to use for the request. Default is None.
+        links (List[str]): A list of URLs as strings.

     Returns:
-        List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
-
-    Raises:
-        ValueError: If the search engine specified is not supported.
-        requests.exceptions.Timeout: If the request times out.
-
-    Example:
-        >>> search_on_web("example query", search_engine="Google", max_results=5)
-        ['http://example.com', 'http://example.org', ...]
+        List[str]: A list of URLs excluding any that end with '.pdf'.
     """
-
-    def format_proxy(proxy):
-        if isinstance(proxy, dict):
-            server = proxy.get('server')
-            username = proxy.get('username')
-            password = proxy.get('password')
-
-            if all([username, password, server]):
-                proxy_url = f"http://{username}:{password}@{server}"
-                return proxy_url
-            else:
-                raise ValueError("Proxy dictionary is missing required fields.")
-        elif isinstance(proxy, str):
-            return proxy  # "https://username:password@ip:port"
-        else:
-            raise TypeError("Proxy should be a dictionary or a string.")
-
-    def filter_pdf_links(links: List[str]) -> List[str]:
-        """
-        Filters out any links that point to PDF files.
-
-        Args:
-            links (List[str]): A list of URLs as strings.
-
-        Returns:
-            List[str]: A list of URLs excluding any that end with '.pdf'.
-        """
-        return [link for link in links if not link.lower().endswith('.pdf')]
-
-    if proxy:
-        proxy = format_proxy(proxy)
-
-    if search_engine.lower() == "google":
-        res = []
-        for url in google_search(query, num_results=max_results, proxy=proxy):
-            res.append(url)
-        return filter_pdf_links(res)
-
-    elif search_engine.lower() == "duckduckgo":
-        research = DuckDuckGoSearchResults(max_results=max_results)
-        res = research.run(query)
-        links = re.findall(r'https?://[^\s,\]]+', res)
-        return filter_pdf_links(links)
-
-    elif search_engine.lower() == "bing":
-        headers = {
-            "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64)
-            AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
-        }
-        search_url = f"https://www.bing.com/search?q={query}"
-        response = requests.get(search_url, headers=headers, timeout=timeout)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
-
-        search_results = []
-        for result in soup.find_all('li', class_='b_algo', limit=max_results):
-            link = result.find('a')['href']
-            search_results.append(link)
-        return filter_pdf_links(search_results)
-
-    elif search_engine.lower() == "searxng":
-        url = f"http://localhost:{port}"
-        params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
-        response = requests.get(url, params=params, timeout=timeout)
-        data = response.json()
-        limited_results = [result['url'] for result in data["results"][:max_results]]
-        return filter_pdf_links(limited_results)
-
-    else:
-        raise ValueError("""The only search engines available are
-        DuckDuckGo, Google, Bing, or SearXNG""")
+    return [link for link in links if not link.lower().endswith('.pdf')]
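
For reviewers who want to exercise the refactored entry point, a minimal usage sketch follows. It is not part of the diff: the import path is an assumption from the repo layout, and the proxy dict shape is the one format_proxy() expects (server, username, password).

    from scrapegraphai.utils.research_web import search_on_web  # import path assumed

    # Engine names are case-insensitive; "Google" is normalized to "google".
    urls = search_on_web("graph-based web scraping", search_engine="DuckDuckGo", max_results=5)

    # A dict proxy is collapsed to "http://username:password@server" by format_proxy().
    proxy = {"server": "127.0.0.1:3128", "username": "user", "password": "secret"}
    urls = search_on_web("graph-based web scraping", search_engine="Google", proxy=proxy)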
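Because the new try/except translates requests failures into built-in TimeoutError and RuntimeError, callers no longer need to import requests to handle errors. A sketch of that calling pattern, assuming a local SearXNG instance is listening on the default port 8080:

    try:
        urls = search_on_web("llm agents", search_engine="SearXNG", port=8080, timeout=5)
    except TimeoutError as e:
        print(f"Search timed out: {e}")        # raised from requests.Timeout
    except (ValueError, RuntimeError) as e:
        print(f"Search failed: {e}")           # invalid input or any other requests error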