Skip to content

Update research_web.py #458

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 14, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 23 additions & 15 deletions scrapegraphai/utils/research_web.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,43 @@
"""
research web module
"""
import re
from typing import List
from langchain_community.tools import DuckDuckGoSearchResults
from googlesearch import search as google_search
import requests
from bs4 import BeautifulSoup

def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]:
"""
Searches the web for a given query using specified search engine options.

Args:
query (str): The search query to find on the internet.
search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'.
search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
max_results (int, optional): The maximum number of search results to return.
port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.

Returns:
List[str]: A list of URLs as strings that are the search results.

Raises:
ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.
ValueError: If the search engine specified is not supported.

Example:
>>> search_on_web("example query", search_engine="Google", max_results=5)
['http://example.com', 'http://example.org', ...]

This function allows switching between Google, DuckDuckGo, Bing, and SearXNG to perform
internet searches, returning a list of result URLs.
"""

if search_engine.lower() == "google":
res = []
for url in google_search(query, stop=max_results):
res.append(url)
return res

elif search_engine.lower() == "duckduckgo":
research = DuckDuckGoSearchResults(max_results=max_results)
res = research.run(query)
links = re.findall(r'https?://[^\s,\]]+', res)
return links

elif search_engine.lower() == "bing":
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
Expand All @@ -51,11 +46,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =
response = requests.get(search_url, headers=headers)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

search_results = []
for result in soup.find_all('li', class_='b_algo', limit=max_results):
link = result.find('a')['href']
search_results.append(link)
return search_results

raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")

elif search_engine.lower() == "searxng":
url = f"http://localhost:{port}"
params = {"q": query, "format": "json"}

# Send the GET request to the server
response = requests.get(url, params=params)

# Parse the response and limit to the specified max_results
data = response.json()
limited_results = data["results"][:max_results]
return limited_results

else:
raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
Loading