Skip to content

Commit 2170131

Browse files
committed
feat(proxy-rotation): add parse (IP address) or search (from broker) functionality for proxy rotation
The broker has been made fully configurable for anonymity level, admissible locations, scheme and max shape so as not to waste resources, unlike the original `free-proxy` package. Other options have been explored (e.g., `proxybroker`, `proxybroker2`) due to their built-in proxy server and rotation capabilities, but the former is no longer maintained, and the latter has issues with any Python version other than Python 3.9.
1 parent db2234b commit 2170131

File tree

3 files changed

+341
-18
lines changed

3 files changed

+341
-18
lines changed

scrapegraphai/utils/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
"""
22
__init__.py file for utils folder
33
"""
4-
from .save_audio_from_bytes import save_audio_from_bytes
4+
55
from .convert_to_csv import convert_to_csv
66
from .convert_to_json import convert_to_json
77
from .prettify_exec_info import prettify_exec_info
8-
from .proxy_rotation import proxy_generator
8+
from .proxy_rotation import Proxy, parse_or_search_proxy, search_proxy_servers
9+
from .save_audio_from_bytes import save_audio_from_bytes
10+
from .sys_dynamic_import import dynamic_import, srcfile_import

scrapegraphai/utils/proxy_rotation.py

Lines changed: 216 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,234 @@
11
"""
22
Module for rotating proxies
33
"""
4+
5+
import ipaddress
6+
import random
7+
from typing import List, Optional, Set, TypedDict
8+
9+
import requests
10+
from fp.errors import FreeProxyException
411
from fp.fp import FreeProxy
512

613

7-
def proxy_generator(num_ips: int) -> list:
8-
"""
9-
Generates a specified number of proxy IP addresses using the FreeProxy library.
14+
class ProxyBrokerCriteria(TypedDict, total=False):
    """Search criteria for the proxy broker.

    Every key is optional (``total=False``). The mapping is unpacked as
    keyword arguments into ``search_proxy_servers``, so the key names must
    match that function's parameter names.
    """

    # whether proxy servers should have minimum level-1 anonymity
    anonymous: bool
    # admissible proxy server locations, e.g. {"GB", "US"}
    countryset: Set[str]
    # whether proxy servers should support HTTPS rather than HTTP
    secure: bool
    # maximum timeout for proxy responses, in seconds
    timeout: float
    # whether the search may be repeated without the location restriction
    search_outside_if_empty: bool
22+
23+
24+
class ProxySettings(TypedDict, total=False):
    """Proxy settings in the shape returned by the parsing helpers.

    Every key is optional at the type level (``total=False``); the parsing
    helpers in this module check for ``server`` at runtime.
    """

    # proxy server address, e.g. "192.168.1.1:8080"
    server: str
    # bypass setting, copied through unchanged when present and non-empty
    bypass: str
    # credentials; must be provided together or not at all
    username: str
    password: str
31+
32+
33+
class Proxy(ProxySettings, total=False):
    """Proxy server information: proxy settings plus optional broker criteria.

    ``criteria`` is declared optional (``total=False``) because it is read
    with ``proxy.get("criteria", {})`` and is omitted entirely when ``server``
    is a plain IP address.
    """

    # broker search criteria, used when the server is "broker"
    criteria: ProxyBrokerCriteria
37+
38+
39+
def search_proxy_servers(
    anonymous: bool = True,
    countryset: Optional[Set[str]] = None,
    secure: bool = False,
    timeout: float = 5.0,
    max_shape: int = 5,
    search_outside_if_empty: bool = True,
) -> List[str]:
    """Search for working proxy servers that match the specified broker criteria.

    Args:
        anonymous: whether proxy servers should have minimum level-1 anonymity.
        countryset: admissible proxy server locations.
        secure: whether proxy servers should support HTTPS rather than HTTP;
            defaults to HTTP.
        timeout: the maximum timeout for proxy responses; defaults to 5.0 seconds.
        max_shape: the maximum number of proxy servers to return; defaults to 5.
        search_outside_if_empty: whether the search should be repeated without
            the location restriction when fewer than ``max_shape`` working
            servers were found.

    Returns:
        A list of proxy server URLs matching the criteria.

    Raises:
        FreeProxyException: if no working proxy server could be found.

    Example:
        >>> search_proxy_servers(
        ...     anonymous=True,
        ...     countryset={"GB", "US"},
        ...     secure=True,
        ...     timeout=1.0,
        ...     max_shape=2,
        ... )
        [
            "http://103.10.63.135:8080",
            "http://113.20.31.250:8080",
        ]
    """
    proxybroker = FreeProxy(
        anonym=anonymous,
        country_id=countryset,
        elite=True,
        https=secure,
        timeout=timeout,
    )

    def search_all(broker: FreeProxy, k: int, search_outside: bool) -> List[str]:
        """Collect up to ``k`` working servers, optionally widening the search."""
        candidateset = broker.get_proxy_list(search_outside)
        # Shuffle so that repeated searches do not always probe the same servers first.
        random.shuffle(candidateset)

        positive = set()

        for address in candidateset:
            setting = {broker.schema: f"http://{address}"}

            # NOTE(review): this reaches into a name-mangled private method of
            # FreeProxy; it may break if the `free-proxy` package changes.
            try:
                server = broker._FreeProxy__check_if_proxy_is_working(setting)
            except requests.exceptions.RequestException:
                continue

            if not server:
                continue

            positive.add(server)

            # Stop probing as soon as enough working servers were found.
            if len(positive) >= k:
                return list(positive)

        n = len(positive)

        if n < k and search_outside:
            # Drop the location restriction and look for the missing servers.
            broker.country_id = None

            try:
                negative = set(search_all(broker, k - n, False))
            except FreeProxyException:
                negative = set()

            positive = positive | negative

        if not positive:
            raise FreeProxyException("missing proxy servers for criteria")

        return list(positive)

    return search_all(proxybroker, max_shape, search_outside_if_empty)
124+
125+
126+
def _parse_proxy(proxy: ProxySettings) -> ProxySettings:
    """Parse a proxy configuration whose server is already known.

    Args:
        proxy: The proxy configuration to parse.

    Returns:
        A 'playwright' compliant proxy configuration containing ``server``,
        plus ``bypass`` when it is non-empty and ``username``/``password``
        when both are present.

    Raises:
        AssertionError: if ``server`` is missing, or if only one of
            ``username`` and ``password`` is provided.
    """
    # Raised explicitly instead of via `assert`, so that the validation is
    # not stripped when Python runs with -O; the exception type is unchanged.
    if "server" not in proxy:
        raise AssertionError("missing server in the proxy configuration")

    authorization = [key in proxy for key in ("username", "password")]

    if any(authorization) and not all(authorization):
        raise AssertionError(
            "username and password must be provided in pairs or not at all"
        )

    parsed = {"server": proxy["server"]}

    if proxy.get("bypass"):
        parsed["bypass"] = proxy["bypass"]

    if all(authorization):
        parsed["username"] = proxy["username"]
        parsed["password"] = proxy["password"]

    return parsed
153+
154+
155+
def _search_proxy(proxy: Proxy) -> ProxySettings:
    """Search for one proxy server matching the specified broker criteria.

    Args:
        proxy: The proxy configuration to search for; its optional
            ``criteria`` mapping is forwarded as keyword arguments to
            ``search_proxy_servers``.

    Returns:
        A 'playwright' compliant proxy configuration holding only ``server``.
    """
    criteria = proxy.get("criteria", {})

    # A single server is enough here; search_proxy_servers raises rather than
    # returning an empty list, so indexing the first element is safe.
    servers = search_proxy_servers(max_shape=1, **criteria)

    return {"server": servers[0]}
167+
168+
169+
def is_ipv4_address(address: str) -> bool:
    """Return whether ``address`` is a string holding a valid IPv4 address.

    Args:
        address: The candidate address, without port (e.g. "192.168.1.1").

    Returns:
        True if ``address`` is a string that parses as an IPv4 address,
        False otherwise.
    """
    # ipaddress.IPv4Address also accepts integers and packed 4-byte values;
    # only textual addresses are meaningful for a proxy server setting.
    if not isinstance(address, str):
        return False

    try:
        ipaddress.IPv4Address(address)
    except ipaddress.AddressValueError:
        return False

    return True
176+
177+
178+
def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
    """Parse a proxy configuration or search for a new one matching
    the specified broker criteria.

    Args:
        proxy: The proxy configuration to parse or search for.

    Returns:
        A 'playwright' compliant proxy configuration.

    Raises:
        AssertionError: if ``server`` is missing, or if it is neither an
            IPv4 address nor the literal 'broker'.

    Notes:
        - If the proxy server is an IPv4 address (optionally prefixed by a
          scheme and/or followed by a port), it is assumed to be a proxy
          server address.
        - If the proxy server is 'broker', a proxy server is searched for
          based on the provided broker criteria.

    Example:
        >>> proxy = {
        ...     "server": "broker",
        ...     "criteria": {
        ...         "anonymous": True,
        ...         "countryset": {"GB", "US"},
        ...         "secure": True,
        ...         "timeout": 5.0,
        ...         "search_outside_if_empty": False
        ...     }
        ... }

        >>> parse_or_search_proxy(proxy)
        {
            "server": "<proxy-server-matching-criteria>",
        }

    Example:
        >>> proxy = {
        ...     "server": "192.168.1.1:8080",
        ...     "username": "<username>",
        ...     "password": "<password>"
        ... }

        >>> parse_or_search_proxy(proxy)
        {
            "server": "192.168.1.1:8080",
            "username": "<username>",
            "password": "<password>"
        }
    """
    # Raised explicitly instead of via `assert`, so that the validation is
    # not stripped when Python runs with -O; the exception type is unchanged.
    if "server" not in proxy:
        raise AssertionError("missing server in the proxy configuration")

    server = proxy["server"]

    # Keep only the host: drop an optional "scheme://" prefix and ":port" suffix.
    host = server.split("://", maxsplit=1)[-1].split(":", maxsplit=1)[0]

    if is_ipv4_address(host):
        return _parse_proxy(proxy)

    if server != "broker":
        raise AssertionError("unknown proxy server")

    return _search_proxy(proxy)

tests/utils/test_proxy_rotation.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import pytest
2+
from fp.errors import FreeProxyException
3+
4+
from scrapegraphai.utils.proxy_rotation import (
5+
Proxy,
6+
_parse_proxy,
7+
_search_proxy,
8+
is_ipv4_address,
9+
parse_or_search_proxy,
10+
search_proxy_servers,
11+
)
12+
13+
14+
def test_search_proxy_servers_success():
    """A search with satisfiable criteria returns a bounded list of strings.

    NOTE(review): this calls the real broker, so it presumably needs network
    access - confirm before running in an isolated CI environment.
    """
    servers = search_proxy_servers(
        anonymous=True,
        countryset={"US"},
        secure=False,
        timeout=10.0,
        max_shape=2,
        search_outside_if_empty=True,
    )

    assert isinstance(servers, list)
    assert all(isinstance(server, str) for server in servers)
    # max_shape caps the number of returned servers
    assert len(servers) <= 2
26+
27+
28+
def test_search_proxy_servers_exception():
    """A search restricted to the location "XX" raises FreeProxyException."""
    with pytest.raises(FreeProxyException):
        search_proxy_servers(
            anonymous=True,
            countryset={"XX"},
            secure=True,
            timeout=1.0,
            max_shape=2,
            search_outside_if_empty=False,
        )
38+
39+
40+
def test_parse_proxy_success():
    """A complete configuration is returned unchanged by _parse_proxy."""
    proxy = {
        "server": "192.168.1.1:8080",
        "username": "user",
        "password": "pass",
        "bypass": "*.local",
    }

    parsed_proxy = _parse_proxy(proxy)
    assert parsed_proxy == proxy
50+
51+
52+
def test_parse_proxy_exception():
    """A username without a password is rejected with an AssertionError."""
    invalid_proxy = {"server": "192.168.1.1:8080", "username": "user"}

    with pytest.raises(AssertionError) as error_info:
        _parse_proxy(invalid_proxy)

    assert "username and password must be provided in pairs" in str(error_info.value)
59+
60+
61+
def test_search_proxy_success():
    """_search_proxy returns a dict holding a server for the given criteria.

    NOTE(review): this calls the real broker, so it presumably needs network
    access - confirm before running in an isolated CI environment.
    """
    proxy = Proxy(criteria={"anonymous": True, "countryset": {"US"}})
    found_proxy = _search_proxy(proxy)

    assert isinstance(found_proxy, dict)
    assert "server" in found_proxy
67+
68+
69+
def test_is_ipv4_address():
    """is_ipv4_address accepts a valid address and rejects malformed ones."""
    assert is_ipv4_address("192.168.1.1") is True
    assert is_ipv4_address("999.999.999.999") is False
    assert is_ipv4_address("no-address") is False
73+
74+
75+
def test_parse_or_search_proxy_success():
    """Both the IP-address path and the broker path return a configuration.

    NOTE(review): the broker half calls the real broker, so it presumably
    needs network access - confirm before running in an isolated CI environment.
    """
    # IP address: the configuration is parsed and returned as given
    proxy = {
        "server": "192.168.1.1:8080",
        "username": "username",
        "password": "password",
    }

    parsed_proxy = parse_or_search_proxy(proxy)
    assert parsed_proxy == proxy

    # 'broker': a server is searched for using the criteria
    proxy_broker = {
        "server": "broker",
        "criteria": {
            "anonymous": True,
            "countryset": {"US"},
            "secure": True,
            "timeout": 10.0,
        },
    }

    found_proxy = parse_or_search_proxy(proxy_broker)

    assert isinstance(found_proxy, dict)
    assert "server" in found_proxy
99+
100+
101+
def test_parse_or_search_proxy_exception():
    """A configuration without a server is rejected with an AssertionError."""
    proxy = {
        "username": "username",
        "password": "password",
    }

    with pytest.raises(AssertionError) as error_info:
        parse_or_search_proxy(proxy)

    assert "missing server in the proxy configuration" in str(error_info.value)
111+
112+
113+
def test_parse_or_search_proxy_unknown_server():
    """A server that is neither an IP address nor 'broker' is rejected."""
    proxy = {
        "server": "unknown",
    }

    with pytest.raises(AssertionError) as error_info:
        parse_or_search_proxy(proxy)

    assert "unknown proxy server" in str(error_info.value)

0 commit comments

Comments
 (0)