Skip to content

Commit a84176f

Browse files
authored
Merge pull request #715 from ScrapeGraphAI/713-pdf-scrapping
713 pdf scraping
2 parents ac31d7f + 99ad654 commit a84176f

File tree

2 files changed

+35
-19
lines changed

2 files changed

+35
-19
lines changed

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
1+
12
## [1.26.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.26.0-beta.1) (2024-09-29)
23

34

4-
### Features
55

66
* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
77
* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24))
88

99

10+
1011
### Bug Fixes
1112

13+
* removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
14+
1215
* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
1316
* removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
1417

scrapegraphai/utils/research_web.py

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Research_web module
2+
research_web module
33
"""
44
import re
55
from typing import List
@@ -8,69 +8,82 @@
88
import requests
99
from bs4 import BeautifulSoup
1010

11-
def search_on_web(query: str, search_engine: str = "Google",
                  max_results: int = 10, port: int = 8080,
                  timeout: int = 10) -> List[str]:
    """
    Searches the web for a given query using the specified search
    engine and filters out PDF links from the results.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use,
            options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'.
            Case-insensitive. Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.
        port (int, optional): The port number to use when searching with 'SearXNG'.
            Default is 8080.
        timeout (int, optional): The number of seconds to wait for a response
            from a request. Default is 10 seconds. Note: only the Bing and
            SearXNG branches issue HTTP requests directly, so only they honor it.

    Returns:
        List[str]: A list of URLs as strings that are the search results,
        excluding any PDF links.

    Raises:
        ValueError: If the search engine specified is not supported.
        requests.exceptions.Timeout: If the request times out.
        requests.exceptions.HTTPError: If Bing or SearXNG respond with an
            HTTP error status.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """

    def filter_pdf_links(links: List[str]) -> List[str]:
        """
        Filters out any links that point to PDF files.

        Args:
            links (List[str]): A list of URLs as strings.

        Returns:
            List[str]: A list of URLs excluding any that end with '.pdf'.
        """
        return [link for link in links if not link.lower().endswith('.pdf')]

    # Normalize once instead of re-lowercasing in every branch.
    engine = search_engine.lower()

    if engine == "google":
        # google_search is a generator; materialize up to max_results URLs.
        res = list(google_search(query, stop=max_results))
        return filter_pdf_links(res)

    elif engine == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        # The DuckDuckGo wrapper returns a single string; pull URLs out of it.
        links = re.findall(r'https?://[^\s,\]]+', res)
        return filter_pdf_links(links)

    elif engine == "bing":
        headers = {
            "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64)
            AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
        }
        search_url = f"https://www.bing.com/search?q={query}"
        response = requests.get(search_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            link = result.find('a')['href']
            search_results.append(link)
        return filter_pdf_links(search_results)

    elif engine == "searxng":
        url = f"http://localhost:{port}"
        params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
        response = requests.get(url, params=params, timeout=timeout)
        # Fail fast on HTTP errors (consistent with the Bing branch) instead of
        # surfacing an opaque JSON decode error on a non-2xx response.
        response.raise_for_status()
        data = response.json()
        limited_results = [result['url'] for result in data["results"][:max_results]]
        return filter_pdf_links(limited_results)

    else:
        raise ValueError("""The only search engines available are
                         DuckDuckGo, Google, Bing, or SearXNG""")

0 commit comments

Comments
 (0)