Skip to content

Commit 789c16f

Browse files
authored
Merge pull request #458 from ScrapeGraphAI/searchngx_integration
merge: update research_web.py
2 parents eaf3e98 + 7ba2f6a commit 789c16f

File tree

1 file changed

+23
-15
lines changed

1 file changed

+23
-15
lines changed

scrapegraphai/utils/research_web.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,43 @@
1-
"""
2-
research web module
3-
"""
41
import re
52
from typing import List
63
from langchain_community.tools import DuckDuckGoSearchResults
74
from googlesearch import search as google_search
85
import requests
96
from bs4 import BeautifulSoup
107

11-
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
8+
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]:
129
"""
1310
Searches the web for a given query using specified search engine options.
1411
1512
Args:
1613
query (str): The search query to find on the internet.
17-
search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'.
14+
search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
1815
max_results (int, optional): The maximum number of search results to return.
16+
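port (int, optional): The port of the SearXNG instance to query; the host is always 'localhost' (requests go to http://localhost:<port>). Only used when search_engine is 'SearXNG'. Default is 8080.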
1917
2018
Returns:
2119
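List[str]: A list of URLs as strings that are the search results. Note: for 'SearXNG' the first max_results entries of the JSON response's 'results' field are returned as-is, with no URL extraction, so they may not be plain URL strings (to be confirmed against the SearXNG response format).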
2220
2321
Raises:
24-
ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.
22+
ValueError: If the search engine specified is not supported.
2523
2624
Example:
2725
>>> search_on_web("example query", search_engine="Google", max_results=5)
2826
['http://example.com', 'http://example.org', ...]
29-
30-
This function allows switching between Google, DuckDuckGo, and Bing to perform
31-
internet searches, returning a list of result URLs.
3227
"""
33-
28+
3429
if search_engine.lower() == "google":
3530
res = []
3631
for url in google_search(query, stop=max_results):
3732
res.append(url)
3833
return res
39-
34+
4035
elif search_engine.lower() == "duckduckgo":
4136
research = DuckDuckGoSearchResults(max_results=max_results)
4237
res = research.run(query)
4338
links = re.findall(r'https?://[^\s,\]]+', res)
4439
return links
45-
40+
4641
elif search_engine.lower() == "bing":
4742
headers = {
4843
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
@@ -51,11 +46,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =
5146
response = requests.get(search_url, headers=headers)
5247
response.raise_for_status()
5348
soup = BeautifulSoup(response.text, "html.parser")
54-
49+
5550
search_results = []
5651
for result in soup.find_all('li', class_='b_algo', limit=max_results):
5752
link = result.find('a')['href']
5853
search_results.append(link)
5954
return search_results
60-
61-
raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
55+
56+
elif search_engine.lower() == "searxng":
57+
url = f"http://localhost:{port}"
58+
params = {"q": query, "format": "json"}
59+
60+
# Send the GET request to the server
61+
response = requests.get(url, params=params)
62+
63+
# Parse the response and limit to the specified max_results
64+
data = response.json()
65+
limited_results = data["results"][:max_results]
66+
return limited_results
67+
68+
else:
69+
raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")

0 commit comments

Comments
 (0)