Skip to content

Commit bd4b26d

Browse files
committed
feat: ConcatNode.py added for heavy merge operations
1 parent fccf034 commit bd4b26d

File tree

5 files changed

+235
-0
lines changed

5 files changed

+235
-0
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper
3+
"""
4+
5+
import os
6+
import json
7+
from dotenv import load_dotenv
8+
from scrapegraphai.graphs import SmartScraperMultiConcatGraph
9+
10+
load_dotenv()
11+
12+
# ************************************************
13+
# Define the configuration for the graph
14+
# ************************************************
15+
16+
graph_config = {
17+
"llm": {
18+
"model": "ollama/llama3.1",
19+
"temperature": 0,
20+
"format": "json", # Ollama needs the format to be specified explicitly
21+
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
22+
},
23+
"verbose": True,
24+
"headless": False
25+
}
26+
27+
# *******************************************************
28+
# Create the SmartScraperMultiGraph instance and run it
29+
# *******************************************************
30+
31+
multiple_search_graph = SmartScraperMultiConcatGraph(
32+
prompt="Who is Marco Perini?",
33+
source= [
34+
"https://perinim.github.io/",
35+
"https://perinim.github.io/cv/"
36+
],
37+
schema=None,
38+
config=graph_config
39+
)
40+
41+
result = multiple_search_graph.run()
42+
print(json.dumps(result, indent=4))

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,4 @@
2525
from .markdown_scraper_multi_graph import MDScraperMultiGraph
2626
from .search_link_graph import SearchLinkGraph
2727
from .screenshot_scraper_graph import ScreenshotScraperGraph
28+
from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
"""
2+
SmartScraperMultiGraph Module
3+
"""
4+
5+
from copy import copy, deepcopy
6+
from typing import List, Optional
7+
from pydantic import BaseModel
8+
9+
from .base_graph import BaseGraph
10+
from .abstract_graph import AbstractGraph
11+
from .smart_scraper_graph import SmartScraperGraph
12+
13+
from ..nodes import (
14+
GraphIteratorNode,
15+
ConcatAnswersNode
16+
)
17+
18+
19+
class SmartScraperMultiConcatGraph(AbstractGraph):
    """
    SmartScraperMultiConcatGraph is a scraping pipeline that scrapes a list of URLs and
    concatenates the answers generated for a given prompt into a single result.
    It only requires a user prompt and a list of URLs.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> search_graph = SmartScraperMultiConcatGraph(
        ...     "What is Chioggia famous for?",
        ...     ["https://en.wikipedia.org/wiki/Chioggia"],
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict,
                 schema: Optional[BaseModel] = None):
        # A shallow copy suffices when the config holds only flat string values;
        # otherwise deep-copy so nested dicts are not shared with the caller.
        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        self.copy_schema = deepcopy(schema)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching.

        Returns:
            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """
        # ************************************************
        # Create a SmartScraperGraph instance
        # ************************************************

        # prompt/source are placeholders: GraphIteratorNode fills them in per URL.
        smart_scraper_instance = SmartScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
            schema=self.copy_schema
        )

        # ************************************************
        # Define the graph nodes
        # ************************************************

        # Runs the SmartScraperGraph once per URL and collects the answers.
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & urls",
            output=["results"],
            node_config={
                "graph_instance": smart_scraper_instance,
            }
        )

        # Concatenates the per-URL answers into the final "answer" state key.
        concat_answers_node = ConcatAnswersNode(
            input="results",
            output=["answer"]
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                concat_answers_node,
            ],
            edges=[
                (graph_iterator_node, concat_answers_node),
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the web scraping and searching process.

        Returns:
            str: The answer to the prompt, or "No answer found." when the
            graph produced no "answer" key in its final state.
        """
        inputs = {"user_prompt": self.prompt, "urls": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")

scrapegraphai/nodes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@
2222
from .merge_generated_scripts import MergeGeneratedScriptsNode
2323
from .fetch_screen_node import FetchScreenNode
2424
from .generate_answer_from_image_node import GenerateAnswerFromImageNode
25+
from .concat_answers_node import ConcatAnswersNode
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""
2+
ConcatAnswersNode Module
3+
"""
4+
5+
from typing import List, Optional
6+
from ..utils.logging import get_logger
7+
from .base_node import BaseNode
8+
9+
class ConcatAnswersNode(BaseNode):
    """
    A node responsible for concatenating the answers from multiple graph
    instances into a single answer.

    Attributes:
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "ConcatAnswers".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "ConcatAnswers",
    ):
        super().__init__(node_name, "node", input, output, 1, node_config)

        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )

    def _merge_dict(self, items: List[dict]) -> dict:
        """Merge a list of answers into one dict keyed item_1..item_n under "products"."""
        return {"products": {f"item_{i+1}": item for i, item in enumerate(items)}}

    def execute(self, state: dict) -> dict:
        """
        Executes the node's logic to concatenate the answers from multiple graph
        instances into a single answer.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the correct data from the state.

        Returns:
            dict: The updated state with the output key containing the generated answer.

        Raises:
            KeyError: If the input keys are not found in the state, indicating
                that the necessary information for generating an answer is missing.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)

        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]

        answers = input_data[0]

        if len(answers) == 1:
            # A single answer is passed through unchanged.
            state.update({self.output[0]: answers[0]})
        else:
            # Merge the answers into a single dict. This branch also covers an
            # empty answer list (previously `answers[0]` raised IndexError),
            # yielding {"products": {}}.
            state.update({self.output[0]: self._merge_dict(answers)})

        return state

0 commit comments

Comments
 (0)