
Commit 1981230

add multi scraper integration
1 parent 4d0d8fa commit 1981230

File tree: 6 files changed, +251 -1 lines changed
Lines changed: 54 additions & 0 deletions

"""
Basic example of a scraping pipeline using ScriptCreatorMultiGraph
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import ScriptCreatorMultiGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
    "library": "beautifulsoup"
}

# ************************************************
# Define the sources for the graph
# ************************************************

urls = [
    "https://schultzbergagency.com/emil-raste-karlsen/",
    "https://schultzbergagency.com/johanna-hedberg/",
]

# ************************************************
# Create the ScriptCreatorMultiGraph instance and run it
# ************************************************

script_creator_graph = ScriptCreatorMultiGraph(
    prompt="Find information about actors",
    # also accepts a string with the already downloaded HTML code
    source=urls,
    config=graph_config
)

result = script_creator_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = script_creator_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

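The example reads the API key from a local .env file via load_dotenv(). A minimal sketch of that file, assuming the key name used above; the value shown is only a placeholder:

    OPENAI_APIKEY=sk-...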
scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -20,3 +20,4 @@
 from .json_scraper_multi import JSONScraperMultiGraph
 from .csv_scraper_graph_multi import CSVScraperMultiGraph
 from .xml_scraper_graph_multi import XMLScraperMultiGraph
+from .script_creator_multi_graph import ScriptCreatorMultiGraph
scrapegraphai/graphs/script_creator_multi_graph.py

Lines changed: 114 additions & 0 deletions

"""
ScriptCreatorMultiGraph Module
"""

from copy import copy, deepcopy
from typing import List, Optional

from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .script_creator_graph import ScriptCreatorGraph

from ..nodes import (
    GraphIteratorNode,
    MergeGeneratedScriptsNode
)


class ScriptCreatorMultiGraph(AbstractGraph):
    """
    ScriptCreatorMultiGraph is a scraping pipeline that generates a web scraping script
    for each URL in a list. It only requires a user prompt and a list of URLs.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[str]): The schema for the graph output.

    Example:
        >>> script_graph = ScriptCreatorMultiGraph(
        ...     "What is Chioggia famous for?",
        ...     source=[],
        ...     config={"llm": {"model": "gpt-3.5-turbo"}},
        ...     schema={}
        ... )
        >>> result = script_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):

        self.max_results = config.get("max_results", 3)

        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching.

        Returns:
            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """

        # ************************************************
        # Create a ScriptCreatorGraph instance
        # ************************************************

        script_generator_instance = ScriptCreatorGraph(
            prompt="",
            source="",
            config=self.copy_config,
        )

        # ************************************************
        # Define the graph nodes
        # ************************************************

        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & urls",
            output=["results"],
            node_config={
                "graph_instance": script_generator_instance,
            }
        )

        merge_scripts_node = MergeGeneratedScriptsNode(
            input="user_prompt & results",
            output=["scripts"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_scripts_node,
            ],
            edges=[
                (graph_iterator_node, merge_scripts_node),
            ],
            entry_point=graph_iterator_node
        )

    def run(self) -> str:
        """
        Executes the web scraping and searching process.

        Returns:
            The generated scripts.
        """
        inputs = {"user_prompt": self.prompt, "urls": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
        return self.final_state.get("scripts", [])

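Since run() returns whatever is stored under the "scripts" key (the list produced by MergeGeneratedScriptsNode), a caller can iterate over it directly. A minimal sketch of consuming that list; the output file names are placeholders and not part of the commit:

    scripts = script_creator_graph.run()
    for i, script in enumerate(scripts):
        # write each generated scraper to its own file (hypothetical paths)
        with open(f"scraper_{i}.py", "w") as f:
            f.write(script)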
scrapegraphai/nodes/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -20,3 +20,4 @@
 from .graph_iterator_node import GraphIteratorNode
 from .merge_answers_node import MergeAnswersNode
 from .generate_answer_omni_node import GenerateAnswerOmniNode
+from .merge_generated_scripts import MergeGeneratedScriptsNode

scrapegraphai/nodes/generate_scraper_node.py

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@ def execute(self, state: dict) -> dict:
         SOURCE: {source}
         QUESTION: {question}
         """
-        print("source:", self.source)
+
         if len(doc) > 1:
             raise NotImplementedError(
                 "Currently GenerateScraperNode cannot handle more than 1 context chunks"
scrapegraphai/nodes/merge_generated_scripts.py

Lines changed: 80 additions & 0 deletions

"""
MergeGeneratedScriptsNode Module
"""

# Imports from standard library
from typing import List, Optional
from tqdm import tqdm

# Imports from Langchain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

from ..utils.logging import get_logger

# Imports from the library
from .base_node import BaseNode


class MergeGeneratedScriptsNode(BaseNode):
    """
    A node responsible for merging the scripts generated by multiple graph instances.

    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "MergeAnswers".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "MergeAnswers",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        self.llm_model = node_config["llm_model"]
        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )

    def execute(self, state: dict) -> dict:
        """
        Executes the node's logic to merge the scripts generated by multiple graph
        instances into a single output.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the correct data from the state.

        Returns:
            dict: The updated state with the output key containing the generated scripts.

        Raises:
            KeyError: If the input keys are not found in the state, indicating
                that the necessary information for generating an answer is missing.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)

        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]

        scripts = input_data[1]

        # Print each generated script
        for i, script_str in enumerate(scripts):
            print(f"Script #{i}")
            print("=" * 40)
            print(script_str)
            print("-" * 40)

        # Update the state with the generated scripts
        state.update({self.output[0]: scripts})
        return state

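For reference, the node can be exercised on its own by passing a state dict that matches its input expression "user_prompt & results". A minimal sketch under the assumption that BaseNode supplies get_input_keys and logger; the llm_model value and state contents are placeholders for illustration:

    node = MergeGeneratedScriptsNode(
        input="user_prompt & results",
        output=["scripts"],
        node_config={"llm_model": None, "verbose": True},  # placeholder model
    )
    state = {
        "user_prompt": "Find information about actors",
        "results": ["# script for url 1", "# script for url 2"],
    }
    state = node.execute(state)  # prints each script and stores the list under "scripts"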