fix: robot node and proxies #89

Merged 1 commit on Apr 27, 2024
2 changes: 1 addition & 1 deletion examples/local_models/Ollama/smart_scraper_ollama.py
@@ -10,7 +10,7 @@
 graph_config = {
     "llm": {
         "model": "ollama/mistral",
-        "temperature": 1,
+        "temperature": 0,
         "format": "json",  # Ollama needs the format to be specified explicitly
         # "model_tokens": 2000,  # set context length arbitrarily,
         "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
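Dropping the temperature from 1 to 0 makes decoding deterministic, which matters here because the graph parses the model's reply as JSON and sampling noise can break that parse. A minimal sketch of the fixed config in use, assuming the SmartScraperGraph entry point from the repository's other examples (the prompt and source URL are illustrative):

```python
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,  # deterministic decoding keeps the JSON reply parseable
        "format": "json",  # Ollama needs the format to be specified explicitly
        "base_url": "http://localhost:11434",
    },
}

# Illustrative prompt and source; depending on the library version an
# "embeddings" entry may also be required in graph_config.
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their descriptions",
    source="https://perinim.github.io/projects/",
    config=graph_config,
)
print(smart_scraper_graph.run())
```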
Empty file.
12 changes: 3 additions & 9 deletions examples/single_node/robot_node.py
@@ -2,22 +2,16 @@
 """
 Example of custom graph using existing nodes
 """

-import os
-from dotenv import load_dotenv
-from scrapegraphai.models import OpenAI
+from scrapegraphai.models import Ollama
 from scrapegraphai.nodes import RobotsNode
-load_dotenv()

 # ************************************************
 # Define the configuration for the graph
 # ************************************************

-openai_key = os.getenv("OPENAI_APIKEY")
-
 graph_config = {
     "llm": {
-        "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
+        "model": "ollama/llama3",
         "temperature": 0,
         "streaming": True
     },
@@ -27,7 +21,7 @@
 # ************************************************
 # Define the node
 # ************************************************

-llm_model = OpenAI(graph_config["llm"])
+llm_model = Ollama(graph_config["llm"])

 robots_node = RobotsNode(
     input="url",
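For context, a sketch of the reworked example run end to end, assuming node_config carries the model under an "llm" key as in the library's other single-node examples (the output key and test URL are illustrative):

```python
from scrapegraphai.models import Ollama
from scrapegraphai.nodes import RobotsNode

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "streaming": True,
    },
}

# No API key needed any more: the model is served by a local Ollama instance.
llm_model = Ollama(graph_config["llm"])

robots_node = RobotsNode(
    input="url",
    output=["is_scrapable"],  # illustrative output key
    node_config={"llm": llm_model},
)

# The node fetches <base_url>/robots.txt and asks the LLM whether scraping is allowed.
state = robots_node.execute({"url": "https://example.com"})
print(state)
```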
11 changes: 4 additions & 7 deletions scrapegraphai/nodes/fetch_node.py
@@ -7,7 +7,6 @@
 from langchain_core.documents import Document
 from .base_node import BaseNode
 from ..utils.remover import remover
-from ..utils.proxy_generator import proxy_generator


 class FetchNode(BaseNode):
@@ -38,16 +37,14 @@ class FetchNode(BaseNode):
     to succeed.
     """

-    def __init__(self, input: str, output: List[str], num_prox: int = True,
-                 node_name: str = "Fetch"):
+    def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
         """
         Initializes the FetchHTMLNode with a node name and node type.
         Arguments:
             node_name (str): name of the node
             prox_rotation (bool): if you wamt to rotate proxies
         """
         super().__init__(node_name, "node", input, output, 1)
-        self.num_prox = num_prox

     def execute(self, state):
         """
@@ -80,13 +77,13 @@ def execute(self, state):
                 "source": "local_dir"
             })]

         # if it is a URL
         else:
-            if self.num_prox > 1:
+            if self.node_config.get("endpoint") is not None:
                 loader = AsyncHtmlLoader(
-                    source, proxies=proxy_generator(self.num_prox))
+                    source, proxies={"http": self.node_config["endpoint"]})
             else:
                 loader = AsyncHtmlLoader(source)

         document = loader.load()
         compressed_document = [
             Document(page_content=remover(str(document)))]
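The behavioral change in FetchNode: instead of rotating across num_prox generated proxies, it now routes through the single proxy endpoint found in its node_config. A standalone sketch of the new branch (the endpoint URL is hypothetical); note the mapping only covers the "http" scheme, so https traffic would bypass the proxy unless an "https" entry were added as well:

```python
from langchain_community.document_loaders import AsyncHtmlLoader

node_config = {"endpoint": "http://localhost:8899"}  # hypothetical proxy endpoint
source = "https://example.com"

if node_config.get("endpoint") is not None:
    # Route plain-HTTP requests through the configured proxy.
    loader = AsyncHtmlLoader(source, proxies={"http": node_config["endpoint"]})
else:
    loader = AsyncHtmlLoader(source)

documents = loader.load()
print(documents[0].page_content[:200])
```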
15 changes: 8 additions & 7 deletions scrapegraphai/nodes/robots_node.py
@@ -1,5 +1,5 @@
 """
-Module for checking if a website is scrapepable or not 
+Module for checking if a website is scrapepable or not
 """
 from typing import List
 from urllib.parse import urlparse
@@ -12,7 +12,7 @@

 class RobotsNode(BaseNode):
     """
-    A node responsible for checking if a website is scrapepable or not. 
+    A node responsible for checking if a website is scrapepable or not.
     It uses the AsyncHtmlLoader for asynchronous
     document loading.
@@ -59,7 +59,7 @@ def __init__(self, input: str, output: List[str], node_config: dict, force_scra
         node_config (dict): Configuration parameters for the node.
         force_scraping (bool): A flag indicating whether scraping should be enforced even
             if disallowed by robots.txt. Defaults to True.
-        node_name (str, optional): The unique identifier name for the node. 
+        node_name (str, optional): The unique identifier name for the node.
             Defaults to "Robots".
         """
         super().__init__(node_name, "node", input, output, 1)
@@ -112,11 +112,12 @@ def execute(self, state):
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
         loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
         document = loader.load()
-        model = self.llm_model.model_name
-
-        if "ollama" in model:
-            model = model.split("/", maxsplit=1)[-1]
+        if "ollama" in self.llm_model.model:
+            self.llm_model.model = self.llm_model.model.split("/")[-1]
+            model = self.llm_model.model.split("/")[-1]

+        else:
+            model = self.llm_model.model_name
         try:
             agent = robots_dictionary[model]
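The last hunk changes how the robots_dictionary lookup key is derived: Ollama model ids carry a provider prefix ("ollama/llama3") that must be stripped, while other providers keep using model_name. A standalone restatement of that logic (the helper name is ours, and the two attributes model/model_name are condensed into one argument for illustration); note that split("/")[-1] also behaves differently from the old split("/", maxsplit=1)[-1] when an id contains more than one slash:

```python
def robots_lookup_key(model_id: str) -> str:
    """Strip the provider prefix from Ollama-style model ids."""
    if "ollama" in model_id:
        # "ollama/llama3" -> "llama3"; with nested ids only the last
        # segment survives ("ollama/org/model" -> "model").
        return model_id.split("/")[-1]
    return model_id  # non-Ollama ids pass through unchanged


assert robots_lookup_key("ollama/llama3") == "llama3"
assert robots_lookup_key("gpt-3.5-turbo") == "gpt-3.5-turbo"
```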
2 changes: 1 addition & 1 deletion scrapegraphai/utils/__init__.py
@@ -5,4 +5,4 @@
 from .convert_to_csv import convert_to_csv
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
-from .proxy_generator import proxy_generator
+from .proxy_rotation import proxy_generator
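Since the package re-exports the symbol, moving the implementation from proxy_generator.py to proxy_rotation.py is transparent to callers. A quick check; the argument is hypothetical, as the generator's signature is not shown in this diff:

```python
# Unchanged public import path after the module rename:
from scrapegraphai.utils import proxy_generator

proxies = proxy_generator(3)  # hypothetical call: ask for three proxies
print(proxies)
```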