Skip to content

Commit 26f89d8

Browse files
committed
feat: refactoring of research web
1 parent a98328c commit 26f89d8

File tree

1 file changed

+31
-18
lines changed

1 file changed

+31
-18
lines changed

scrapegraphai/utils/research_web.py

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Research_web module
2+
research_web module
33
"""
44
import re
55
from typing import List
@@ -8,69 +8,82 @@
88
import requests
99
from bs4 import BeautifulSoup
1010

11-
def search_on_web(query: str, search_engine: str = "Google",
                  max_results: int = 10, port: int = 8080,
                  timeout: int = 10) -> List[str]:
    """
    Searches the web for a given query using specified search
    engine options and filters out PDF links.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use,
        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.
        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
        timeout (int, optional): The number of seconds to wait
        for a response from a request. Default is 10 seconds.

    Returns:
        List[str]: A list of URLs as strings that are the search results, excluding any PDF links.

    Raises:
        ValueError: If the search engine specified is not supported.
        requests.exceptions.Timeout: If the request times out.
        requests.exceptions.HTTPError: If the Bing or SearXNG endpoint returns an error status.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """

    def filter_pdf_links(links: List[str]) -> List[str]:
        """
        Filters out any links that point to PDF files.

        Args:
            links (List[str]): A list of URLs as strings.

        Returns:
            List[str]: A list of URLs excluding any that end with '.pdf'.
        """
        return [link for link in links if not link.lower().endswith('.pdf')]

    # Normalize once instead of recomputing .lower() in every branch.
    engine = search_engine.lower()

    if engine == "google":
        # google_search is a generator; materialize up to max_results URLs.
        res = list(google_search(query, stop=max_results))
        return filter_pdf_links(res)

    elif engine == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        # The results come back as one formatted string; extract the URLs.
        links = re.findall(r'https?://[^\s,\]]+', res)
        return filter_pdf_links(links)

    elif engine == "bing":
        headers = {
            "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64)
            AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
        }
        search_url = f"https://www.bing.com/search?q={query}"
        response = requests.get(search_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Each organic Bing result is an <li class="b_algo"> with an anchor link.
        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            link = result.find('a')['href']
            search_results.append(link)
        return filter_pdf_links(search_results)

    elif engine == "searxng":
        url = f"http://localhost:{port}"
        params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
        response = requests.get(url, params=params, timeout=timeout)
        # Fail fast on HTTP errors (consistent with the Bing branch) instead of
        # letting a non-200 body surface as a confusing JSON decode error.
        response.raise_for_status()
        data = response.json()
        limited_results = [result['url'] for result in data["results"][:max_results]]
        return filter_pdf_links(limited_results)

    else:
        raise ValueError("""The only search engines available are
                          DuckDuckGo, Google, Bing, or SearXNG""")

0 commit comments

Comments
 (0)