Skip to content

Commit ffa1067

Browse files
authored
Merge pull request #756 from shenghongtw/pre/beta
The smart_scraper_multi_graph method is too expensive
2 parents b912904 + da2a3c8 commit ffa1067

8 files changed

+371
-6
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper
3+
"""
4+
import os
5+
import json
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import SmartScraperMultiLiteGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
load_dotenv()
11+
12+
# ************************************************
13+
# Define the configuration for the graph
14+
# ************************************************
15+
16+
17+
graph_config = {
18+
"llm": {
19+
"api_key": os.getenv("OPENAI_API_KEY"),
20+
"model": "openai/gpt-4o",
21+
},
22+
"verbose": True,
23+
"headless": False,
24+
}
25+
26+
# ************************************************
27+
# Create the SmartScraperGraph instance and run it
28+
# ************************************************
29+
30+
smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph(
31+
prompt="Who is Marco Perini?",
32+
source= [
33+
"https://perinim.github.io/",
34+
"https://perinim.github.io/cv/"
35+
],
36+
config=graph_config
37+
)
38+
39+
result = smart_scraper_multi_lite_graph.run()
40+
print(json.dumps(result, indent=4))
41+
42+
# ************************************************
43+
# Get graph execution info
44+
# ************************************************
45+
46+
graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info()
47+
print(prettify_exec_info(graph_exec_info))

scrapegraphai/graphs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,5 @@
2525
from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
2626
from .code_generator_graph import CodeGeneratorGraph
2727
from .depth_search_graph import DepthSearchGraph
28+
from .smart_scraper_multi_lite_graph import SmartScraperMultiLiteGraph
29+
from .scrape_graph import ScrapeGraph

scrapegraphai/graphs/scrape_graph.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""
2+
ScrapeGraph Module
3+
"""
4+
from typing import Optional
5+
from pydantic import BaseModel
6+
from .base_graph import BaseGraph
7+
from .abstract_graph import AbstractGraph
8+
from ..nodes import (
9+
FetchNode,
10+
ParseNode,
11+
)
12+
13+
class ScrapeGraph(AbstractGraph):
    """
    ScrapeGraph is a scraping pipeline that automates the process of
    extracting information from web pages. It only fetches and parses the
    page content (its graph contains a fetch node and a parse node — no
    LLM answer generation is performed here).

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.

    Args:
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        prompt (str): The prompt for the graph (optional).
        schema (BaseModel): The schema for the graph output (optional).

    Example:
        >>> scrape_graph = ScrapeGraph(
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
        ... )
        >>> result = scrape_graph.run()
    """

    def __init__(self, source: str, config: dict, prompt: str = "", schema: Optional[BaseModel] = None):
        super().__init__(prompt, config, source, schema)

        # Sources starting with "http" are fetched over the network;
        # anything else is treated as a local directory/file source.
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="url| local_dir",
            output=["doc"],
            node_config={
                "llm_model": self.llm_model,
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
                "browser_base": self.config.get("browser_base"),
                "scrape_do": self.config.get("scrape_do")
            }
        )

        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "llm_model": self.llm_model,
                "chunk_size": self.model_token
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
            ],
            edges=[
                (fetch_node, parse_node),
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the parsed content.

        Returns:
            str: The scraping content.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("parsed_doc", "No document found.")

scrapegraphai/graphs/smart_scraper_multi_concat_graph.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,11 @@ class SmartScraperMultiConcatGraph(AbstractGraph):
3535
schema (Optional[BaseModel]): The schema for the graph output.
3636
3737
Example:
38-
>>> search_graph = MultipleSearchGraph(
38+
>>> smart_scraper_multi_concat_graph = SmartScraperMultiConcatGraph(
3939
... "What is Chioggia famous for?",
4040
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
4141
... )
42-
>>> result = search_graph.run()
42+
>>> result = smart_scraper_multi_concat_graph.run()
4343
"""
4444

4545
def __init__(self, prompt: str, source: List[str],

scrapegraphai/graphs/smart_scraper_multi_graph.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ class SmartScraperMultiGraph(AbstractGraph):
1818
SmartScraperMultiGraph is a scraping pipeline that scrapes a
1919
list of URLs and generates answers to a given prompt.
2020
It only requires a user prompt and a list of URLs.
21+
The difference with the SmartScraperMultiLiteGraph is that in this case the content will be abstracted
22+
by the llm, then merged, and finally passed to the llm.
2123
2224
Attributes:
2325
prompt (str): The user prompt to search the internet.
@@ -34,11 +36,15 @@ class SmartScraperMultiGraph(AbstractGraph):
3436
schema (Optional[BaseModel]): The schema for the graph output.
3537
3638
Example:
37-
>>> search_graph = MultipleSearchGraph(
38-
... "What is Chioggia famous for?",
39-
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
39+
>>> smart_scraper_multi_graph = SmartScraperMultiGraph(
40+
... prompt="Who is Marco Perini?",
41+
... source= [
42+
... "https://perinim.github.io/",
43+
... "https://perinim.github.io/cv/"
44+
... ],
45+
... config={"llm": {"model": "openai/gpt-3.5-turbo"}}
4046
... )
41-
>>> result = search_graph.run()
47+
>>> result = smart_scraper_multi_graph.run()
4248
"""
4349

4450
def __init__(self, prompt: str, source: List[str],
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""
2+
SmartScraperMultiGraph Module
3+
"""
4+
from copy import deepcopy
5+
from typing import List, Optional
6+
from pydantic import BaseModel
7+
from .base_graph import BaseGraph
8+
from .abstract_graph import AbstractGraph
9+
from .scrape_graph import ScrapeGraph
10+
from ..nodes import (
11+
GraphIteratorNode,
12+
MergeAnswersNode,
13+
)
14+
from ..utils.copy import safe_deepcopy
15+
16+
class SmartScraperMultiLiteGraph(AbstractGraph):
    """
    SmartScraperMultiLiteGraph is a scraping pipeline that scrapes a
    list of URLs, merges their content first, and finally generates an
    answer to a given prompt.
    It only requires a user prompt and a list of URLs.
    The difference with the SmartScraperMultiGraph is that in this case
    the content is merged before being passed to the llm.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph(
        ...     prompt="Who is Marco Perini?",
        ...     source=[
        ...         "https://perinim.github.io/",
        ...         "https://perinim.github.io/cv/"
        ...     ],
        ...     config={"llm": {"model": "openai/gpt-3.5-turbo"}}
        ... )
        >>> result = smart_scraper_multi_lite_graph.run()
    """

    def __init__(self, prompt: str, source: List[str],
                 config: dict, schema: Optional[BaseModel] = None):
        # Copies are taken before super().__init__ so the per-URL sub-graphs
        # spawned by GraphIteratorNode do not share mutable config/schema
        # state with this graph.
        self.copy_config = safe_deepcopy(config)
        self.copy_schema = deepcopy(schema)
        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping
        and parsing, then merging the content and generating answers to a given prompt.

        Returns:
            BaseGraph: A graph instance representing the scrape-then-merge workflow.
        """
        # Runs one ScrapeGraph (fetch + parse only, no LLM call) per URL.
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & urls",
            output=["parsed_doc"],
            node_config={
                "graph_instance": ScrapeGraph,
                "scraper_config": self.copy_config,
            },
            schema=self.copy_schema
        )

        # Merges all parsed documents and generates a single answer.
        merge_answers_node = MergeAnswersNode(
            input="user_prompt & parsed_doc",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.copy_schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the web scraping and parsing process first and
        then concatenates the content and generates answers to a given prompt.

        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt, "urls": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
        return self.final_state.get("answer", "No answer found.")

tests/graphs/scrape_graph_test.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
Module for testing the scrape graph class
3+
"""
4+
5+
import os
6+
import pytest
7+
import pandas as pd
8+
from dotenv import load_dotenv
9+
from scrapegraphai.graphs import ScrapeGraph
10+
from scrapegraphai.utils import prettify_exec_info
11+
12+
load_dotenv()
13+
14+
@pytest.fixture
15+
def graph_config():
16+
"""Configuration of the graph"""
17+
openai_key = os.getenv("OPENAI_APIKEY")
18+
return {
19+
"llm": {
20+
"api_key": openai_key,
21+
"model": "openai/gpt-3.5-turbo",
22+
},
23+
"verbose": True,
24+
"headless": False,
25+
}
26+
27+
def test_scraping_pipeline(graph_config):
28+
"""Start of the scraping pipeline"""
29+
scrape_graph = ScrapeGraph(
30+
source="https://perinim.github.io/projects/",
31+
config=graph_config,
32+
)
33+
34+
result = scrape_graph.run()
35+
36+
assert result is not None
37+
assert isinstance(result, list)
38+
39+
def test_get_execution_info(graph_config):
40+
"""Get the execution info"""
41+
scrape_graph = ScrapeGraph(
42+
source="https://perinim.github.io/projects/",
43+
config=graph_config,
44+
)
45+
46+
scrape_graph.run()
47+
48+
graph_exec_info = scrape_graph.get_execution_info()
49+
50+
assert graph_exec_info is not None

0 commit comments

Comments
 (0)