Skip to content

The smart_scraper_multi_graph method is too expensive #756

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Oct 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions examples/openai/smart_scraper_multi_lite_openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Basic example of scraping pipeline using SmartScraper
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperMultiLiteGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************


graph_config = {
"llm": {
"api_key": os.getenv("OPENAI_API_KEY"),
"model": "openai/gpt-4o",
},
"verbose": True,
"headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph(
prompt="Who is Marco Perini?",
source= [
"https://perinim.github.io/",
"https://perinim.github.io/cv/"
],
config=graph_config
)

result = smart_scraper_multi_lite_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
2 changes: 2 additions & 0 deletions scrapegraphai/graphs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,5 @@
from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
from .code_generator_graph import CodeGeneratorGraph
from .depth_search_graph import DepthSearchGraph
from .smart_scraper_multi_lite_graph import SmartScraperMultiLiteGraph
from .scrape_graph import ScrapeGraph
98 changes: 98 additions & 0 deletions scrapegraphai/graphs/scrape_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""
SmartScraperGraph Module
"""
from typing import Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
)

class ScrapeGraph(AbstractGraph):
"""
ScrapeGraph is a scraping pipeline that automates the process of
extracting information from web pages.

Attributes:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (BaseModel): The schema for the graph output.
verbose (bool): A flag indicating whether to show print statements during execution.
headless (bool): A flag indicating whether to run the graph in headless mode.

Args:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (BaseModel): The schema for the graph output.

Example:
>>> scraper = ScraperGraph(
... "https://en.wikipedia.org/wiki/Chioggia",
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
... )
>>> result = smart_scraper.run()
)
"""

def __init__(self, source: str, config: dict, prompt: str = "", schema: Optional[BaseModel] = None):
super().__init__(prompt, config, source, schema)

self.input_key = "url" if source.startswith("http") else "local_dir"

def _create_graph(self) -> BaseGraph:
"""
Creates the graph of nodes representing the workflow for web scraping.

Returns:
BaseGraph: A graph instance representing the web scraping workflow.
"""
fetch_node = FetchNode(
input="url| local_dir",
output=["doc"],
node_config={
"llm_model": self.llm_model,
"force": self.config.get("force", False),
"cut": self.config.get("cut", True),
"loader_kwargs": self.config.get("loader_kwargs", {}),
"browser_base": self.config.get("browser_base"),
"scrape_do": self.config.get("scrape_do")
}
)

parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
node_config={
"llm_model": self.llm_model,
"chunk_size": self.model_token
}
)

return BaseGraph(
nodes=[
fetch_node,
parse_node,
],
edges=[
(fetch_node, parse_node),
],
entry_point=fetch_node,
graph_name=self.__class__.__name__
)

def run(self) -> str:
"""
Executes the scraping process and returns the scraping content.

Returns:
str: The scraping content.
"""

inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

return self.final_state.get("parsed_doc", "No document found.")
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/smart_scraper_multi_concat_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ class SmartScraperMultiConcatGraph(AbstractGraph):
schema (Optional[BaseModel]): The schema for the graph output.

Example:
>>> search_graph = MultipleSearchGraph(
>>> smart_scraper_multi_concat_graph = SmartScraperMultiConcatGraph(
... "What is Chioggia famous for?",
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
... )
>>> result = search_graph.run()
>>> result = smart_scraper_multi_concat_graph.run()
"""

def __init__(self, prompt: str, source: List[str],
Expand Down
14 changes: 10 additions & 4 deletions scrapegraphai/graphs/smart_scraper_multi_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ class SmartScraperMultiGraph(AbstractGraph):
SmartScraperMultiGraph is a scraping pipeline that scrapes a
list of URLs and generates answers to a given prompt.
It only requires a user prompt and a list of URLs.
The difference with the SmartScraperMultiLiteGraph is that in this case the content will be abstracted
by llm and then merged finally passed to the llm.

Attributes:
prompt (str): The user prompt to search the internet.
Expand All @@ -34,11 +36,15 @@ class SmartScraperMultiGraph(AbstractGraph):
schema (Optional[BaseModel]): The schema for the graph output.

Example:
>>> search_graph = MultipleSearchGraph(
... "What is Chioggia famous for?",
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
>>> smart_scraper_multi_graph = SmartScraperMultiGraph(
... prompt="Who is Marco Perini?",
... source= [
... "https://perinim.github.io/",
... "https://perinim.github.io/cv/"
... ],
... config={"llm": {"model": "openai/gpt-3.5-turbo"}}
... )
>>> result = search_graph.run()
>>> result = smart_scraper_multi_graph.run()
"""

def __init__(self, prompt: str, source: List[str],
Expand Down
103 changes: 103 additions & 0 deletions scrapegraphai/graphs/smart_scraper_multi_lite_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
SmartScraperMultiGraph Module
"""
from copy import deepcopy
from typing import List, Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .scrape_graph import ScrapeGraph
from ..nodes import (
GraphIteratorNode,
MergeAnswersNode,
)
from ..utils.copy import safe_deepcopy

class SmartScraperMultiLiteGraph(AbstractGraph):
"""
SmartScraperMultiLiteGraph is a scraping pipeline that scrapes a
list of URLs and merge the content first and finally generates answers to a given prompt.
It only requires a user prompt and a list of URLs.
The difference with the SmartScraperMultiGraph is that in this case the content is merged
before to be passed to the llm.

Attributes:
prompt (str): The user prompt to search the internet.
llm_model (dict): The configuration for the language model.
embedder_model (dict): The configuration for the embedder model.
headless (bool): A flag to run the browser in headless mode.
verbose (bool): A flag to display the execution information.
model_token (int): The token limit for the language model.

Args:
prompt (str): The user prompt to search the internet.
source (List[str]): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (Optional[BaseModel]): The schema for the graph output.

Example:
>>> smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph(
... prompt="Who is Marco Perini?",
... source= [
... "https://perinim.github.io/",
... "https://perinim.github.io/cv/"
... ],
... config={"llm": {"model": "openai/gpt-3.5-turbo"}}
... )
>>> result = smart_scraper_multi_lite_graph.run()
"""

def __init__(self, prompt: str, source: List[str],
config: dict, schema: Optional[BaseModel] = None):

self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
super().__init__(prompt, config, source, schema)

def _create_graph(self) -> BaseGraph:
"""
Creates the graph of nodes representing the workflow for web scraping
and parsing and then merge the content and generates answers to a given prompt.
"""
graph_iterator_node = GraphIteratorNode(
input="user_prompt & urls",
output=["parsed_doc"],
node_config={
"graph_instance": ScrapeGraph,
"scraper_config": self.copy_config,
},
schema=self.copy_schema
)

merge_answers_node = MergeAnswersNode(
input="user_prompt & parsed_doc",
output=["answer"],
node_config={
"llm_model": self.llm_model,
"schema": self.copy_schema
}
)

return BaseGraph(
nodes=[
graph_iterator_node,
merge_answers_node,
],
edges=[
(graph_iterator_node, merge_answers_node),
],
entry_point=graph_iterator_node,
graph_name=self.__class__.__name__
)

def run(self) -> str:
"""
Executes the web scraping and parsing process first and
then concatenate the content and generates answers to a given prompt.

Returns:
str: The answer to the prompt.
"""
inputs = {"user_prompt": self.prompt, "urls": self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)
return self.final_state.get("answer", "No answer found.")
50 changes: 50 additions & 0 deletions tests/graphs/scrape_graph_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Module for testing the scrape graph class
"""

import os
import pytest
import pandas as pd
from dotenv import load_dotenv
from scrapegraphai.graphs import ScrapeGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

@pytest.fixture
def graph_config():
    """Configuration of the graph.

    The API key is read from ``OPENAI_APIKEY`` (the spelling this test
    suite has historically used) with a fallback to ``OPENAI_API_KEY``,
    the spelling used by the example scripts — so either environment
    variable works.
    """
    # Prefer the legacy variable; fall back to the conventional one.
    openai_key = os.getenv("OPENAI_APIKEY") or os.getenv("OPENAI_API_KEY")
    return {
        "llm": {
            "api_key": openai_key,
            "model": "openai/gpt-3.5-turbo",
        },
        "verbose": True,
        "headless": False,
    }

def test_scraping_pipeline(graph_config):
    """Run the scraping pipeline end to end and sanity-check its output."""
    graph = ScrapeGraph(
        source="https://perinim.github.io/projects/",
        config=graph_config,
    )

    output = graph.run()

    # The parse step should yield a non-empty list of document chunks.
    assert output is not None
    assert isinstance(output, list)

def test_get_execution_info(graph_config):
    """After a run, execution info must be available from the graph."""
    graph = ScrapeGraph(
        source="https://perinim.github.io/projects/",
        config=graph_config,
    )

    graph.run()

    # Execution metadata is collected as a side effect of run().
    assert graph.get_execution_info() is not None
Loading