Skip to content

Commit 6108c3e

Browse files
authored
Merge pull request #397 from VinciGit00/pre/beta2
Pre/beta2
2 parents ce0a47a + 0cb46f5 commit 6108c3e

File tree

5 files changed

+164
-11
lines changed

5 files changed

+164
-11
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
Example of custom graph using existing nodes
3+
"""
4+
5+
from scrapegraphai.models import Ollama
6+
from scrapegraphai.nodes import SearchInternetNode
7+
8+
# ************************************************
9+
# Define the configuration for the graph
10+
# ************************************************
11+
12+
graph_config = {
13+
"llm": {
14+
"model": "llama3",
15+
"temperature": 0,
16+
"streaming": True
17+
},
18+
"search_engine": "google",
19+
"max_results": 3,
20+
"verbose": True
21+
}
22+
23+
# ************************************************
24+
# Define the node
25+
# ************************************************
26+
27+
llm_model = Ollama(graph_config["llm"])
28+
29+
search_node = SearchInternetNode(
30+
input="user_input",
31+
output=["search_results"],
32+
node_config={
33+
"llm_model": llm_model,
34+
"search_engine": graph_config["search_engine"],
35+
"max_results": graph_config["max_results"],
36+
"verbose": graph_config["verbose"]
37+
}
38+
)
39+
40+
# ************************************************
41+
# Test the node
42+
# ************************************************
43+
44+
state = {
45+
"user_input": "What is the capital of France?"
46+
}
47+
48+
result = search_node.execute(state)
49+
50+
print(result)

scrapegraphai/nodes/search_internet_node.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def __init__(
4343
self.verbose = (
4444
False if node_config is None else node_config.get("verbose", False)
4545
)
46+
self.search_engine = node_config.get("search_engine", "google")
4647
self.max_results = node_config.get("max_results", 3)
4748

4849
def execute(self, state: dict) -> dict:
@@ -97,7 +98,8 @@ def execute(self, state: dict) -> dict:
9798

9899
self.logger.info(f"Search Query: {search_query}")
99100

100-
answer = search_on_web(query=search_query, max_results=self.max_results)
101+
answer = search_on_web(query=search_query, max_results=self.max_results,
102+
search_engine=self.search_engine)
101103

102104
if len(answer) == 0:
103105
# raise an exception if no answer is found

scrapegraphai/utils/research_web.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,61 @@
11
"""
2-
Module for making the request on the web
2+
Module for searching the web with Google, DuckDuckGo, or Bing
33
"""
44
import re
55
from typing import List
66
from langchain_community.tools import DuckDuckGoSearchResults
77
from googlesearch import search as google_search
8-
8+
import requests
9+
from bs4 import BeautifulSoup
910

1011
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
    """
    Searches the web for a given query using specified search engine options.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use, options include
            'Google', 'DuckDuckGo', or 'Bing' (matched case-insensitively). Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
        ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.
        requests.RequestException: If the Bing request times out or Bing answers with an
            HTTP error status.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]

    This function allows switching between Google, DuckDuckGo, and Bing to perform
    internet searches, returning a list of result URLs.
    """
    # Normalise once so every branch compares against the same value.
    engine = search_engine.lower()

    if engine == "google":
        return list(google_search(query, stop=max_results))

    if engine == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        # The tool returns one text blob; pull the URLs out of it.
        return re.findall(r'https?://[^\s,\]]+', res)

    if engine == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Pass the query through `params` so requests URL-encodes it; a raw
        # f-string breaks on spaces, '&', '#', and similar characters.
        # The timeout keeps a stalled connection from blocking forever.
        response = requests.get(
            "https://www.bing.com/search",
            params={"q": query},
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all('li', class_='b_algo'):
            # Checked before appending so that max_results=0 yields no results.
            if len(search_results) >= max_results:
                break
            # Skip result entries without a usable link instead of crashing
            # on `None['href']`.
            anchor = result.find('a', href=True)
            if anchor is None:
                continue
            search_results.append(anchor['href'])
        return search_results

    raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import unittest
2+
from scrapegraphai.models import Ollama
3+
from scrapegraphai.nodes import SearchInternetNode
4+
5+
class TestSearchInternetNode(unittest.TestCase):
    """Checks that SearchInternetNode adds search results to the state it receives."""

    def setUp(self):
        # Configuration for the graph
        self.graph_config = {
            "llm": {
                "model": "llama3",
                "temperature": 0,
                "streaming": True
            },
            "search_engine": "google",
            "max_results": 3,
            "verbose": True
        }

        # Define the model
        self.llm_model = Ollama(self.graph_config["llm"])

        # Initialize the SearchInternetNode
        self.search_node = SearchInternetNode(
            input="user_input",
            output=["search_results"],
            node_config={
                "llm_model": self.llm_model,
                "search_engine": self.graph_config["search_engine"],
                "max_results": self.graph_config["max_results"],
                "verbose": self.graph_config["verbose"]
            }
        )

    def test_execute_search_node(self):
        """The node keeps the user input and adds a bounded list of result URLs."""
        question = "What is the capital of France?"

        # Execute the node
        result = self.search_node.execute({"user_input": question})

        # Live search results change over time, so assert the shape of the
        # output rather than comparing against a fixed list of URLs.
        self.assertEqual(set(result), {"user_input", "search_results"})
        self.assertEqual(result["user_input"], question)

        urls = result["search_results"]
        self.assertIsInstance(urls, list)
        self.assertGreater(len(urls), 0)
        self.assertLessEqual(len(urls), self.graph_config["max_results"])
        for url in urls:
            self.assertRegex(url, r"^https?://")
56+
57+
if __name__ == "__main__":
58+
unittest.main()

tests/utils/research_web_test.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import pytest
2+
from scrapegraphai.utils.research_web import search_on_web # Replace with actual path to your file
3+
4+
5+
def test_google_search():
    """Google searches should return exactly the requested number of results."""
    requested = 2
    urls = search_on_web("test query", search_engine="Google", max_results=requested)
    # Only the count is checked; the content of the results is not inspected.
    assert len(urls) == requested
10+
11+
def test_bing_search():
    """Tests search_on_web with Bing search engine."""
    results = search_on_web("test query", search_engine="Bing", max_results=1)
    # `is not None` can never fail here (the function returns a list or
    # raises), so check the actual contract: a bounded list of URLs.
    assert isinstance(results, list)
    assert len(results) <= 1
    assert all(url.startswith("http") for url in results)
16+
17+
18+
def test_invalid_search_engine():
    """An unsupported engine name must be rejected with ValueError."""
    unsupported = dict(search_engine="Yahoo", max_results=5)
    with pytest.raises(ValueError):
        search_on_web("test query", **unsupported)
22+
23+
24+
def test_max_results():
    """A smaller max_results must never yield more results than a larger one."""
    # The generator is consumed in order, so the 5-result query runs first.
    fewer, more = (
        search_on_web("test query", max_results=limit) for limit in (5, 10)
    )
    assert len(fewer) <= len(more)

0 commit comments

Comments
 (0)