Skip to content

feat: add new proxy rotation function #86

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions scrapegraphai/nodes/fetch_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from langchain_core.documents import Document
from .base_node import BaseNode
from ..utils.remover import remover
from ..utils.proxy_generator import proxy_generator


class FetchNode(BaseNode):
Expand Down Expand Up @@ -37,13 +38,16 @@ class FetchNode(BaseNode):
to succeed.
"""

def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
def __init__(self, input: str, output: List[str], num_prox: int = 1,
             node_name: str = "Fetch"):
    """
    Initializes the FetchNode and records how many proxies to use.

    Arguments:
        input (str): forwarded unchanged to the base node initializer
        output (List[str]): forwarded unchanged to the base node initializer
        num_prox (int): number of proxies to request when fetching a URL;
            proxies are only requested when this value is greater than 1.
            Defaults to 1, i.e. no proxy is used.
        node_name (str): name of the node
    """
    super().__init__(node_name, "node", input, output, 1)
    # The default used to be ``True``; ``1`` compares equal to it, so the
    # "greater than 1" check in execute() behaves the same for callers that
    # rely on the default, while the value now matches the declared int type.
    self.num_prox = num_prox

def execute(self, state):
"""
Expand Down Expand Up @@ -78,7 +82,11 @@ def execute(self, state):

# if it is a URL
else:
loader = AsyncHtmlLoader(source)
if self.num_prox > 1:
loader = AsyncHtmlLoader(
source, proxies=proxy_generator(self.num_prox))
else:
loader = AsyncHtmlLoader(source)
document = loader.load()
compressed_document = [
Document(page_content=remover(str(document)))]
Expand Down
1 change: 1 addition & 0 deletions scrapegraphai/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .convert_to_csv import convert_to_csv
from .convert_to_json import convert_to_json
from .prettify_exec_info import prettify_exec_info
from .proxy_generator import proxy_generator
31 changes: 31 additions & 0 deletions scrapegraphai/utils/proxy_rotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Module for rotating proxies
"""
from fp.fp import FreeProxy


def proxy_generator(num_ips: int) -> list:
    """
    Collects proxies by calling ``FreeProxy().get()`` ``num_ips`` times.

    Args:
        num_ips (int): How many proxies to request. Zero or a negative
            value yields an empty list without calling FreeProxy at all.

    Returns:
        list: The values returned by ``FreeProxy().get()``, in request
        order. The result is a plain list, not a dict.

    Example:
        >>> len(proxy_generator(3))
        3
        >>> proxy_generator(0)
        []
    """
    # NOTE(review): every request builds a fresh FreeProxy with default
    # options; whether consecutive calls return distinct proxies depends on
    # the library's selection policy -- confirm if real rotation is required.
    return [FreeProxy().get() for _ in range(num_ips)]