Skip to content

Commit e981796

Browse files
committed
docs: base and fetch node
1 parent 5ae67f5 commit e981796

File tree

2 files changed

+81
-62
lines changed

2 files changed

+81
-62
lines changed

scrapegraphai/nodes/base_node.py

Lines changed: 70 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,51 @@
11
"""
2-
Module for creating the basic node
2+
Module for defining BaseNode, an abstract base class for nodes in a graph-based workflow.
33
"""
4+
45
from abc import ABC, abstractmethod
56
from typing import Optional, List
67
import re
78

89

910
class BaseNode(ABC):
1011
"""
11-
An abstract base class for nodes in a graph-based workflow. Each node is
12-
intended to perform a specific action when executed as part of the graph's
13-
processing flow.
12+
An abstract base class for nodes in a graph-based workflow, designed to perform specific actions when executed.
1413
1514
Attributes:
16-
node_name (str): A unique identifier for the node.
17-
node_type (str): Specifies the node's type, which influences how the
18-
node interacts within the graph. Valid values are
19-
"node" for standard nodes and "conditional_node" for
20-
nodes that determine the flow based on conditions.
21-
22-
Methods:
23-
execute(state): An abstract method that subclasses must implement. This
24-
method should contain the logic that the node executes
25-
when it is reached in the graph's flow. It takes the
26-
graph's current state as input and returns the updated
27-
state after execution.
28-
29-
Args:
30-
node_name (str): The unique identifier name for the node. This name is
31-
used to reference the node within the graph.
32-
node_type (str): The type of the node, limited to "node" or
33-
"conditional_node". This categorization helps in
34-
determining the node's role and behavior within the
35-
graph.
36-
37-
Raises:
38-
ValueError: If the provided `node_type` is not one of the allowed
39-
values ("node" or "conditional_node"), a ValueError is
40-
raised to indicate the incorrect usage.
15+
node_name (str): The unique identifier name for the node.
16+
input (str): Boolean expression defining the input keys needed from the state.
17+
output (List[str]): List of output keys to be updated in the state.
18+
min_input_len (int): Minimum required number of input keys.
19+
node_config (Optional[dict]): Additional configuration for the node.
20+
21+
Example:
22+
>>> class MyNode(BaseNode):
23+
... def execute(self, state):
24+
... # Implementation of node logic here
25+
... return state
26+
...
27+
>>> my_node = MyNode("ExampleNode", "node", "input_spec", ["output_spec"])
28+
>>> updated_state = my_node.execute({'key': 'value'})
29+
{'key': 'value'}
4130
"""
4231

4332
def __init__(self, node_name: str, node_type: str, input: str, output: List[str],
4433
min_input_len: int = 1, node_config: Optional[dict] = None):
4534
"""
46-
Initialize the node with a unique identifier and a specified node type.
35+
Initialize the instance with the node's name, type, input/output specifications, and configuration details.
4736
4837
Args:
49-
node_name (str): The unique identifier name for the node.
50-
node_type (str): The type of the node, limited to "node" or "conditional_node".
38+
node_name (str): Name for identifying the node.
39+
node_type (str): Type of the node; must be 'node' or 'conditional_node'.
40+
input (str): Expression defining the input keys needed from the state.
41+
output (List[str]): List of output keys to be updated in the state.
42+
min_input_len (int, optional): Minimum required number of input keys; defaults to 1.
43+
node_config (Optional[dict], optional): Additional configuration for the node; defaults to None.
5144
5245
Raises:
53-
ValueError: If node_type is not "node" or "conditional_node".
46+
ValueError: If `node_type` is not one of the allowed types.
5447
"""
48+
5549
self.node_name = node_name
5650
self.input = input
5751
self.output = output
@@ -66,17 +60,31 @@ def __init__(self, node_name: str, node_type: str, input: str, output: List[str]
6660
@abstractmethod
6761
def execute(self, state: dict) -> dict:
6862
"""
69-
Execute the node's logic and return the updated state.
63+
Execute the node's logic based on the current state and update it accordingly.
64+
7065
Args:
7166
state (dict): The current state of the graph.
72-
:return: The updated state after executing this node.
67+
68+
Returns:
69+
dict: The updated state after executing the node's logic.
7370
"""
71+
7472
pass
7573

7674
def get_input_keys(self, state: dict) -> List[str]:
77-
"""Use the _parse_input_keys method to identify which state keys are
78-
needed based on the input attribute
7975
"""
76+
Determines the necessary state keys based on the input specification.
77+
78+
Args:
79+
state (dict): The current state of the graph used to parse input keys.
80+
81+
Returns:
82+
List[str]: A list of input keys required for node operation.
83+
84+
Raises:
85+
ValueError: If an error occurs while parsing the input keys.
86+
"""
87+
8088
try:
8189
input_keys = self._parse_input_keys(state, self.input)
8290
self._validate_input_keys(input_keys)
@@ -86,23 +94,37 @@ def get_input_keys(self, state: dict) -> List[str]:
8694
f"Error parsing input keys for {self.node_name}: {str(e)}")
8795

8896
def _validate_input_keys(self, input_keys):
97+
"""
98+
Validates that the provided input keys meet the minimum length requirement.
99+
100+
Args:
101+
input_keys (List[str]): The list of input keys to validate.
102+
103+
Raises:
104+
ValueError: If the number of input keys is less than the minimum required.
105+
"""
106+
89107
if len(input_keys) < self.min_input_len:
90108
raise ValueError(
91109
f"""{self.node_name} requires at least {self.min_input_len} input keys,
92110
got {len(input_keys)}.""")
93111

94112
def _parse_input_keys(self, state: dict, expression: str) -> List[str]:
95113
"""
96-
Parses the input keys expression and identifies the corresponding keys
97-
from the state that match the expression logic.
114+
Parses the input keys expression to extract relevant keys from the state based on logical conditions.
115+
The expression can contain AND (&), OR (|), and parentheses to group conditions.
98116
99117
Args:
100118
state (dict): The current state of the graph.
101119
expression (str): The input keys expression to parse.
102120
103121
Returns:
104122
List[str]: A list of key names that match the input keys expression logic.
123+
124+
Raises:
125+
ValueError: If the expression is invalid or if no state keys match the expression.
105126
"""
127+
106128
# Check for empty expression
107129
if not expression:
108130
raise ValueError("Empty expression.")
@@ -142,23 +164,30 @@ def _parse_input_keys(self, state: dict, expression: str) -> List[str]:
142164
"Missing or unbalanced parentheses in expression.")
143165

144166
# Helper function to evaluate an expression without parentheses
145-
def evaluate_simple_expression(exp):
167+
def evaluate_simple_expression(exp: str) -> List[str]:
168+
"""Evaluate an expression without parentheses."""
169+
146170
# Split the expression by the OR operator and process each segment
147171
for or_segment in exp.split('|'):
172+
148173
# Check if all elements in an AND segment are in state
149174
and_segment = or_segment.split('&')
150175
if all(elem.strip() in state for elem in and_segment):
151176
return [elem.strip() for elem in and_segment if elem.strip() in state]
152177
return []
153178

154179
# Helper function to evaluate expressions with parentheses
155-
def evaluate_expression(expression):
180+
def evaluate_expression(expression: str) -> List[str]:
181+
"""Evaluate an expression with parentheses."""
182+
156183
while '(' in expression:
157184
start = expression.rfind('(')
158185
end = expression.find(')', start)
159186
sub_exp = expression[start + 1:end]
187+
160188
# Replace the evaluated part with a placeholder and then evaluate it
161189
sub_result = evaluate_simple_expression(sub_exp)
190+
162191
# For simplicity in handling, join sub-results with OR to reprocess them later
163192
expression = expression[:start] + \
164193
'|'.join(sub_result) + expression[end+1:]

scrapegraphai/nodes/fetch_node.py

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -12,38 +12,28 @@
1212
class FetchNode(BaseNode):
1313
"""
1414
A node responsible for fetching the HTML content of a specified URL and updating
15-
the graph's state with this content. It uses the AsyncHtmlLoader for asynchronous
16-
document loading.
15+
the graph's state with this content. It uses the AsyncChromiumLoader to fetch the
16+
content asynchronously.
1717
1818
This node acts as a starting point in many scraping workflows, preparing the state
1919
with the necessary HTML content for further processing by subsequent nodes in the graph.
2020
2121
Attributes:
22-
node_name (str): The unique identifier name for the node.
23-
node_type (str): The type of the node, defaulting to "node". This categorization
24-
helps in determining the node's role and behavior within the graph.
25-
The "node" type is used for standard operational nodes.
26-
22+
headless (bool): A flag indicating whether the browser should run in headless mode.
23+
verbose (bool): A flag indicating whether to print verbose output during execution.
24+
2725
Args:
28-
node_name (str): The unique identifier name for the node. This name is used to
29-
reference the node within the graph.
30-
node_type (str, optional): The type of the node, limited to "node" or
31-
"conditional_node". Defaults to "node".
26+
input (str): Boolean expression defining the input keys needed from the state.
27+
output (List[str]): List of output keys to be updated in the state.
28+
node_config (Optional[dict]): Additional configuration for the node.
29+
node_name (str): The unique identifier name for the node, defaulting to "Fetch".
3230
3331
Methods:
34-
execute(state): Fetches the HTML content for the URL specified in the state and
35-
updates the state with this content under the 'document' key.
36-
The 'url' key must be present in the state for the operation
37-
to succeed.
32+
execute(state): Fetches the HTML content for the URL specified in the state
33+
and updates the state with the fetched content under the specified output key.
3834
"""
3935

4036
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
41-
"""
42-
Initializes the FetchHTMLNode with a node name and node type.
43-
Arguments:
44-
node_name (str): name of the node
45-
prox_rotation (bool): if you wamt to rotate proxies
46-
"""
4737
super().__init__(node_name, "node", input, output, 1)
4838

4939
self.headless = True if node_config is None else node_config.get("headless", True)

0 commit comments

Comments
 (0)