
fix: bug for calculate costs #91


Merged · 10 commits · Apr 28, 2024
19 changes: 19 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,22 @@
## [0.4.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.4.0-beta.1...v0.4.0-beta.2) (2024-04-27)


### Bug Fixes

* robot node and proxies ([adbc08f](https://github.com/VinciGit00/Scrapegraph-ai/commit/adbc08f27bc0966822f054f3af0e1f94fc0b87f5))

## [0.4.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.3.0...v0.4.0-beta.1) (2024-04-27)


### Features

* add new proxy rotation function ([f6077d1](https://github.com/VinciGit00/Scrapegraph-ai/commit/f6077d1f98023ac3bf0c89ef6b3d67dde4818df7))


### Bug Fixes

* changed proxy function ([b754dd9](https://github.com/VinciGit00/Scrapegraph-ai/commit/b754dd909cd2aa2d5b5d94d9c7879ba3da58adc4))

## [0.3.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.2.8...v0.3.0) (2024-04-26)


2 changes: 1 addition & 1 deletion examples/local_models/Ollama/smart_scraper_ollama.py
@@ -10,7 +10,7 @@
graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 1,
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
27 changes: 27 additions & 0 deletions examples/single_node/fetch_node.py
@@ -0,0 +1,27 @@
"""
Example of using a single FetchNode on its own
"""

from scrapegraphai.nodes import FetchNode

# ************************************************
# Define the node
# ************************************************


fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
)

# ************************************************
# Test the node
# ************************************************

state = {
"url": "https://twitter.com/home"
}

result = fetch_node.execute(state)

print(result)
12 changes: 3 additions & 9 deletions examples/single_node/robot_node.py
@@ -2,22 +2,16 @@
Example of using a single RobotsNode on its own
"""

import os
from dotenv import load_dotenv
from scrapegraphai.models import OpenAI
from scrapegraphai.models import Ollama
from scrapegraphai.nodes import RobotsNode
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
"llm": {
"api_key": openai_key,
"model": "gpt-3.5-turbo",
"model": "ollama/llama3",
"temperature": 0,
"streaming": True
},
@@ -27,7 +27,7 @@
# Define the node
# ************************************************

llm_model = OpenAI(graph_config["llm"])
llm_model = Ollama(graph_config["llm"])

robots_node = RobotsNode(
input="url",
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scrapegraphai"
version = "0.3.1"
version = "0.4.0b2"
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
authors = [
"Marco Vinciguerra <[email protected]>",
32 changes: 18 additions & 14 deletions scrapegraphai/graphs/base_graph.py
@@ -56,7 +56,7 @@ def _create_edges(self, edges: list) -> dict:
edge_dict[from_node.node_name] = to_node.node_name
return edge_dict

def execute(self, initial_state: dict) -> dict:
def execute(self, initial_state: dict) -> (dict, list):
"""
Executes the graph by traversing nodes starting from the entry point. The execution
follows the edges based on the result of each node's execution and continues until
@@ -68,13 +68,12 @@ def execute(self, initial_state: dict) -> dict:
Returns:
    (dict, list): The state after execution has completed (possibly altered by the nodes), plus a list of per-node execution info.
"""
print(self.nodes)
current_node_name = self.nodes[0]
state = initial_state

# variables for tracking execution info
total_exec_time = 0.0
exec_info = {}
exec_info = []
cb_total = {
"total_tokens": 0,
"prompt_tokens": 0,
@@ -94,18 +93,19 @@
total_exec_time += node_exec_time

cb = {
"node_name": index.node_name,
"total_tokens": cb.total_tokens,
"prompt_tokens": cb.prompt_tokens,
"completion_tokens": cb.completion_tokens,
"successful_requests": cb.successful_requests,
"total_cost_USD": cb.total_cost,
}

exec_info[current_node_name] = {
"exec_time": node_exec_time,
"model_info": cb
}

exec_info.append(
cb
)

cb_total["total_tokens"] += cb["total_tokens"]
cb_total["prompt_tokens"] += cb["prompt_tokens"]
cb_total["completion_tokens"] += cb["completion_tokens"]
@@ -119,10 +119,14 @@
else:
current_node_name = None

execution_info = {
"total_exec_time": total_exec_time,
"total_model_info": cb_total,
"nodes_info": exec_info
}

return state, execution_info
exec_info.append({
"node_name": "TOTAL RESULT",
"total_tokens": cb_total["total_tokens"],
"prompt_tokens": cb_total["prompt_tokens"],
"completion_tokens": cb_total["completion_tokens"],
"successful_requests": cb_total["successful_requests"],
"total_cost_USD": cb_total["total_cost_USD"],
"exec_time": total_exec_time,
})

return state, exec_info
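
With these changes, execute() returns a (state, exec_info) tuple: exec_info is a flat list of per-node cost dicts closed by a "TOTAL RESULT" row. A minimal consumption sketch — the graph construction and the state key are assumptions, not part of this diff:

# A minimal sketch, assuming `graph` is an already-built BaseGraph whose
# nodes read "url" from the state (construction elided).
state, exec_info = graph.execute({"url": "https://example.com"})

for entry in exec_info:
    # Each entry carries token counts and cost; the final entry is the
    # "TOTAL RESULT" aggregate appended above.
    print(entry["node_name"], entry["total_tokens"], entry["total_cost_USD"])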
11 changes: 8 additions & 3 deletions scrapegraphai/nodes/fetch_node.py
@@ -42,6 +42,7 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
Initializes the FetchHTMLNode with a node name and node type.
Arguments:
node_name (str): name of the node
proxy_rotation (bool): if you want to rotate proxies
"""
super().__init__(node_name, "node", input, output, 1)

@@ -58,7 +59,7 @@ def execute(self, state):

Raises:
KeyError: If the 'url' key is not found in the state, indicating that the
necessary information to perform the operation is missing.
"""
print(f"--- Executing {self.node_name} Node ---")

@@ -76,9 +77,13 @@
"source": "local_dir"
})]

# if it is a URL
else:
loader = AsyncHtmlLoader(source)
if self.node_config is not None and self.node_config.get("endpoint") is not None:
loader = AsyncHtmlLoader(
source, proxies={"http": self.node_config["endpoint"]})
else:
loader = AsyncHtmlLoader(source)

document = loader.load()
compressed_document = [
Document(page_content=remover(str(document)))]
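For context, a hedged sketch of the new proxy branch in isolation — the loader import path and the endpoint value are assumptions, not part of this diff:

from langchain_community.document_loaders import AsyncHtmlLoader  # assumed import path

node_config = {"endpoint": "http://103.10.63.135:8080"}  # hypothetical proxy
source = "https://example.com"

# Mirrors the branch added above: go through the proxy only when an
# endpoint is configured, otherwise load directly.
if node_config is not None and node_config.get("endpoint") is not None:
    loader = AsyncHtmlLoader(source, proxies={"http": node_config["endpoint"]})
else:
    loader = AsyncHtmlLoader(source)

document = loader.load()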
15 changes: 8 additions & 7 deletions scrapegraphai/nodes/robots_node.py
@@ -1,5 +1,5 @@
"""
Module for checking if a website is scrapable or not
"""
from typing import List
from urllib.parse import urlparse
@@ -12,7 +12,7 @@

class RobotsNode(BaseNode):
"""
A node responsible for checking if a website is scrapable or not.
A node responsible for checking if a website is scrapepable or not.
It uses the AsyncHtmlLoader for asynchronous
document loading.

@@ -59,7 +59,7 @@ def __init__(self, input: str, output: List[str], node_config: dict, force_scra
node_config (dict): Configuration parameters for the node.
force_scraping (bool): A flag indicating whether scraping should be enforced even
if disallowed by robots.txt. Defaults to True.
node_name (str, optional): The unique identifier name for the node.
node_name (str, optional): The unique identifier name for the node.
Defaults to "Robots".
"""
super().__init__(node_name, "node", input, output, 1)
@@ -112,11 +112,12 @@ def execute(self, state):
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
document = loader.load()
model = self.llm_model.model_name

if "ollama" in model:
model = model.split("/", maxsplit=1)[-1]
if "ollama" in self.llm_model.model:
self.llm_model.model = self.llm_model.model.split("/")[-1]
model = self.llm_model.model.split("/")[-1]

else:
model = self.llm_model.model_name
try:
agent = robots_dictionary[model]

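The agent lookup that follows (robots_dictionary[model]) is presumably keyed by bare model names, so Ollama identifiers need their "ollama/" prefix stripped. A minimal sketch of the normalization above:

# Ollama models are configured as "ollama/<name>"; the robots dictionary is
# presumably keyed by the bare name (assumption based on this diff).
model = "ollama/llama3"
if "ollama" in model:
    model = model.split("/")[-1]  # -> "llama3"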
1 change: 1 addition & 0 deletions scrapegraphai/utils/__init__.py
@@ -5,3 +5,4 @@
from .convert_to_csv import convert_to_csv
from .convert_to_json import convert_to_json
from .prettify_exec_info import prettify_exec_info
from .proxy_rotation import proxy_generator
35 changes: 4 additions & 31 deletions scrapegraphai/utils/prettify_exec_info.py
@@ -5,44 +5,17 @@
import pandas as pd


def prettify_exec_info(complete_result: dict) -> pd.DataFrame:
def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame:
"""
Transform the execution information of the graph into a DataFrame for better visualization.

Args:
- complete_result (dict): The complete execution information of the graph.
- complete_result (list[dict]): The complete execution information of the graph.

Returns:
- pd.DataFrame: The execution information of the graph in a DataFrame.
"""

nodes_info = complete_result['nodes_info']
total_info = {
'total_exec_time': complete_result['total_exec_time'],
'total_model_info': complete_result['total_model_info']
}
df_nodes = pd.DataFrame(complete_result)

# Convert node-specific information to DataFrame
flat_data = []
for node_name, node_info in nodes_info.items():
flat_data.append({
'Node': node_name,
'Execution Time': node_info['exec_time'],
# Unpack the model_info dict into the row
**node_info['model_info']
})

df_nodes = pd.DataFrame(flat_data)

# Add a row for the total execution time and total model info
total_row = {
'Node': 'Total',
'Execution Time': total_info['total_exec_time'],
# Unpack the total_model_info dict into the row
**total_info['total_model_info']
}
df_total = pd.DataFrame([total_row])

# Combine the nodes DataFrame with the total info DataFrame
df_combined_with_total = pd.concat([df_nodes, df_total], ignore_index=True)
return df_combined_with_total
return df_nodes
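
Because exec_info is now a list of flat dicts, the helper reduces to a single DataFrame call. A usage sketch — the dict shape follows the base_graph.py changes above; the values are illustrative, not real output:

from scrapegraphai.utils import prettify_exec_info

# Shape mirrors what BaseGraph.execute() now returns; values are made up.
exec_info = [
    {"node_name": "Fetch", "total_tokens": 0, "prompt_tokens": 0,
     "completion_tokens": 0, "successful_requests": 0, "total_cost_USD": 0.0},
    {"node_name": "TOTAL RESULT", "total_tokens": 0, "prompt_tokens": 0,
     "completion_tokens": 0, "successful_requests": 0, "total_cost_USD": 0.0,
     "exec_time": 1.2},
]

print(prettify_exec_info(exec_info))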
31 changes: 31 additions & 0 deletions scrapegraphai/utils/proxy_rotation.py
@@ -0,0 +1,31 @@
"""
Module for rotating proxies
"""
from fp.fp import FreeProxy


def proxy_generator(num_ips: int):
"""
Rotates through a specified number of proxy IPs using the FreeProxy library.

Args:
num_ips (int): The number of proxy IPs to rotate through.

    Returns:
        list: A list of proxy IP addresses, one entry per requested rotation slot.

    Example:
        >>> proxy_generator(5)
        ['192.168.1.1:8080',
         '103.10.63.135:8080',
         '176.9.75.42:8080',
         '37.57.216.2:8080',
         '113.20.31.250:8080']
"""
res = []

for _ in range(num_ips):
res.append(FreeProxy().get())
return res
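A short usage sketch — hypothetical wiring: it assumes the free-proxy package is installed, that get() yields bare host:port strings as the docstring example shows, and that the chosen address is handed to FetchNode via node_config, per the fetch_node.py change above:

from scrapegraphai.utils import proxy_generator

proxies = proxy_generator(3)       # e.g. ['103.10.63.135:8080', ...]
endpoint = f"http://{proxies[0]}"  # pick one address for this run

# endpoint could then be supplied as node_config={"endpoint": endpoint}
# when constructing a FetchNode.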
1 change: 0 additions & 1 deletion tests/nodes/.env.example

This file was deleted.

44 changes: 44 additions & 0 deletions tests/nodes/fetch_node_test.py
@@ -0,0 +1,44 @@
"""
Module for testing fetch_node
"""
import pytest
from scrapegraphai.nodes import FetchNode


@pytest.fixture
def setup():
"""
Set up a FetchNode instance for the tests
"""
# ************************************************
# Define the node
# ************************************************

fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],
)

return fetch_node

# ************************************************
# Test the node
# ************************************************


def test_fetch_node(setup):
    """
    Run the FetchNode on a sample URL
    """
state = {
"url": "https://twitter.com/home"
}

result = setup.execute(state)

assert result is not None


# If you need to run this script directly
if __name__ == "__main__":
pytest.main()