Commit 0a275d5

Merge pull request #743 from aziz-ullah-khan/pre/beta

Replace the google search package with googlesearch-python and integrate proxy support.

2 parents 528a974 + e828c70
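The practical effect of this merge: a proxy set in the graph configuration is now threaded through SearchGraph down to the googlesearch-python call. Below is a minimal sketch of how a caller might use it, assuming loader_kwargs is picked up from the top-level config as the diffs below suggest; the model name, API key, and proxy credentials are all placeholders.

    from scrapegraphai.graphs import SearchGraph

    graph_config = {
        "llm": {"model": "openai/gpt-4o-mini", "api_key": "YOUR_KEY"},  # placeholders
        "loader_kwargs": {
            # Dict form, assembled into "http://username:password@server"
            # by the new format_proxy() helper in research_web.py ...
            "proxy": {
                "server": "1.2.3.4:8080",
                "username": "user",
                "password": "pass",
            },
            # ... or, alternatively, a single URL string:
            # "proxy": "https://user:pass@1.2.3.4:8080",
        },
    }

    search_graph = SearchGraph(
        prompt="What is ScrapeGraphAI?",
        config=graph_config,
    )
    print(search_graph.run())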

File tree

6 files changed: +91 −10 lines changed
New example file (path not shown in this view)

Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
+"""
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from pydantic import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+together_key = os.getenv("TOGETHER_KEY")
+
+graph_config = {
+    "llm": {
+        "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+        "api_key": together_key,
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
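The example reads TOGETHER_KEY from the environment (load_dotenv() pulls it from a local .env file). A quick way to supply it in-process instead, with a placeholder value:

    import os

    # Assumes no .env file is present; set the key directly before running the example.
    os.environ["TOGETHER_KEY"] = "your-together-api-key"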

pyproject.toml

Lines changed: 3 additions & 4 deletions
@@ -28,13 +28,12 @@ dependencies = [
     "free-proxy>=1.1.1",
     "playwright>=1.43.0",
     "undetected-playwright>=0.3.0",
-    "google>=3.0.0",
     "langchain-ollama>=0.1.3",
-    "simpleeval>=1.0.0",
-    "semchunk>=2.2.0",
-    "transformers>=4.44.2",
     "qdrant-client>=1.11.3",
     "fastembed>=0.3.6"
+    "semchunk>=2.2.0",
+    "transformers>=4.44.2",
+    "googlesearch-python>=1.2.5"
 ]

 license = "MIT"

requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@ minify-html>=0.15.0
 free-proxy>=1.1.1
 playwright>=1.43.0
 undetected-playwright>=0.3.0
-google>=3.0.0
 semchunk>=1.0.1
 langchain-ollama>=0.1.3
-simpleeval>=0.9.13
+simpleeval>=0.9.13
+googlesearch-python>=1.2.5

scrapegraphai/graphs/search_graph.py

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@ def _create_graph(self) -> BaseGraph:
             node_config={
                 "llm_model": self.llm_model,
                 "max_results": self.max_results,
+                "loader_kwargs": self.loader_kwargs,
                 "search_engine": self.copy_config.get("search_engine")
             }
         )

scrapegraphai/nodes/search_internet_node.py

Lines changed: 3 additions & 2 deletions
@@ -41,6 +41,7 @@ def __init__(
         self.verbose = (
             False if node_config is None else node_config.get("verbose", False)
         )
+        self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
         self.search_engine = (
             node_config["search_engine"]
             if node_config.get("search_engine")
@@ -93,8 +94,8 @@ def execute(self, state: dict) -> dict:

         self.logger.info(f"Search Query: {search_query}")

-        answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine)
+        answer = search_on_web(query=search_query, num_results=self.max_results,
+                               search_engine=self.search_engine, proxy=self.proxy)

         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
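For reference, a rough sketch of the node_config shape the updated node consumes; in practice SearchGraph assembles this dict itself (see the change above), and the llm_model placeholder stands in for an already-initialised model instance:

    llm_model = None  # placeholder: normally an initialised chat-model object

    node_config = {
        "llm_model": llm_model,
        "max_results": 10,
        # The new self.proxy line reads from here:
        "loader_kwargs": {"proxy": "https://user:pass@1.2.3.4:8080"},
        "search_engine": "google",
        "verbose": True,
    }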

scrapegraphai/utils/research_web.py

Lines changed: 22 additions & 2 deletions
@@ -10,7 +10,7 @@

 def search_on_web(query: str, search_engine: str = "Google",
                   max_results: int = 10, port: int = 8080,
-                  timeout: int = 10) -> List[str]:
+                  timeout: int = 10, proxy: str | dict = None) -> List[str]:
     """
     Searches the web for a given query using specified search
     engine options and filters out PDF links.
@@ -23,6 +23,7 @@ def search_on_web(query: str, search_engine: str = "Google",
         port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
         timeout (int, optional): The number of seconds to wait
             for a response from a request. Default is 10 seconds.
+        proxy (dict or string, optional): The proxy server to use for the request. Default is None.

     Returns:
         List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
@@ -36,6 +37,22 @@ def search_on_web(query: str, search_engine: str = "Google",
         ['http://example.com', 'http://example.org', ...]
     """

+    def format_proxy(proxy):
+        if isinstance(proxy, dict):
+            server = proxy.get('server')
+            username = proxy.get('username')
+            password = proxy.get('password')
+
+            if all([username, password, server]):
+                proxy_url = f"http://{username}:{password}@{server}"
+                return proxy_url
+            else:
+                raise ValueError("Proxy dictionary is missing required fields.")
+        elif isinstance(proxy, str):
+            return proxy  # "https://username:password@ip:port"
+        else:
+            raise TypeError("Proxy should be a dictionary or a string.")
+
     def filter_pdf_links(links: List[str]) -> List[str]:
         """
         Filters out any links that point to PDF files.
@@ -48,9 +65,12 @@ def filter_pdf_links(links: List[str]) -> List[str]:
         """
         return [link for link in links if not link.lower().endswith('.pdf')]

+    if proxy:
+        proxy = format_proxy(proxy)
+
     if search_engine.lower() == "google":
         res = []
-        for url in google_search(query, stop=max_results):
+        for url in google_search(query, num_results=max_results, proxy=proxy):
             res.append(url)
         return filter_pdf_links(res)

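Both proxy forms accepted by the new format_proxy() helper can also be exercised by calling search_on_web() directly. A sketch with placeholder credentials (note that the signature shown above still names the count parameter max_results):

    from scrapegraphai.utils.research_web import search_on_web

    # Dict form: assembled into "http://username:password@server".
    urls = search_on_web(
        query="scrapegraphai",
        max_results=5,
        proxy={"server": "1.2.3.4:8080", "username": "user", "password": "pass"},
    )

    # String form: passed through to googlesearch-python unchanged.
    urls = search_on_web(
        query="scrapegraphai",
        max_results=5,
        proxy="https://user:pass@1.2.3.4:8080",
    )
    print(urls)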