Skip to content

Commit 0631985

Browse files
committed
docs: graphs and helpers docstrings
1 parent 18c20eb commit 0631985

14 files changed

+305
-81
lines changed

scrapegraphai/builders/graph_builder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Module for making the graph building
2+
GraphBuilder Module
33
"""
44

55
from langchain_core.prompts import ChatPromptTemplate

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
__init__.py file for graphs folder
33
"""
4+
45
from .base_graph import BaseGraph
56
from .smart_scraper_graph import SmartScraperGraph
67
from .speech_graph import SpeechGraph

scrapegraphai/graphs/abstract_graph.py

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
2-
Module having abstract class for creating all the graphs
2+
AbstractGraph Module
33
"""
4+
45
from abc import ABC, abstractmethod
56
from typing import Optional
67
from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq
@@ -9,13 +10,34 @@
910

1011
class AbstractGraph(ABC):
1112
"""
12-
Abstract class representing a generic graph-based tool.
13+
Scaffolding class for creating a graph representation and executing it.
14+
15+
Attributes:
16+
prompt (str): The prompt for the graph.
17+
source (str): The source of the graph.
18+
config (dict): Configuration parameters for the graph.
19+
llm_model: An instance of a language model client, configured for generating answers.
20+
embedder_model: An instance of an embedding model client, configured for generating embeddings.
21+
verbose (bool): A flag indicating whether to show print statements during execution.
22+
headless (bool): A flag indicating whether to run the graph in headless mode.
23+
24+
Args:
25+
prompt (str): The prompt for the graph.
26+
config (dict): Configuration parameters for the graph.
27+
source (str, optional): The source of the graph.
28+
29+
Example:
30+
>>> class MyGraph(AbstractGraph):
31+
... def _create_graph(self):
32+
... # Implementation of graph creation here
33+
... return graph
34+
...
35+
>>> my_graph = MyGraph("Example Graph", {"llm": {"model": "gpt-3.5-turbo"}}, "example_source")
36+
>>> result = my_graph.run()
1337
"""
1438

1539
def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
16-
"""
17-
Initializes the AbstractGraph with a prompt, file source, and configuration.
18-
"""
40+
1941
self.prompt = prompt
2042
self.source = source
2143
self.config = config
@@ -32,10 +54,20 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
3254
self.final_state = None
3355
self.execution_info = None
3456

35-
def _create_llm(self, llm_config: dict):
57+
def _create_llm(self, llm_config: dict) -> object:
3658
"""
37-
Creates an instance of the language model (OpenAI or Gemini) based on configuration.
59+
Create a large language model instance based on the configuration provided.
60+
61+
Args:
62+
llm_config (dict): Configuration parameters for the language model.
63+
64+
Returns:
65+
object: An instance of the language model client.
66+
67+
Raises:
68+
KeyError: If the model is not supported.
3869
"""
70+
3971
llm_defaults = {
4072
"temperature": 0,
4173
"streaming": False
@@ -104,16 +136,27 @@ def _create_llm(self, llm_config: dict):
104136

105137
def get_state(self, key=None) -> dict:
106138
"""""
107-
Obtain the current state
139+
Get the final state of the graph.
140+
141+
Args:
142+
key (str, optional): The key of the final state to retrieve.
143+
144+
Returns:
145+
dict: The final state of the graph.
108146
"""
147+
109148
if key is not None:
110149
return self.final_state[key]
111150
return self.final_state
112151

113152
def get_execution_info(self):
114153
"""
115154
Returns the execution information of the graph.
155+
156+
Returns:
157+
dict: The execution information of the graph.
116158
"""
159+
117160
return self.execution_info
118161

119162
@abstractmethod

scrapegraphai/graphs/base_graph.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
2-
Module for creating the base graphs
3-
"""
2+
BaseGraph Module
3+
"""
4+
45
import time
56
import warnings
67
from langchain_community.callbacks import get_openai_callback
@@ -16,21 +17,33 @@ class BaseGraph:
1617
key-value pair corresponds to the from-node and to-node relationship.
1718
entry_point (str): The name of the entry point node from which the graph execution begins.
1819
19-
Methods:
20-
execute(initial_state): Executes the graph's nodes starting from the entry point and
21-
traverses the graph based on the provided initial state.
22-
2320
Args:
2421
nodes (iterable): An iterable of node instances that will be part of the graph.
2522
edges (iterable): An iterable of tuples where each tuple represents a directed edge
2623
in the graph, defined by a pair of nodes (from_node, to_node).
2724
entry_point (BaseNode): The node instance that represents the entry point of the graph.
25+
26+
Raises:
27+
Warning: If the entry point node is not the first node in the list.
28+
29+
Example:
30+
>>> BaseGraph(
31+
... nodes=[
32+
... fetch_node,
33+
... parse_node,
34+
... rag_node,
35+
... generate_answer_node,
36+
... ],
37+
... edges=[
38+
... (fetch_node, parse_node),
39+
... (parse_node, rag_node),
40+
... (rag_node, generate_answer_node)
41+
... ],
42+
... entry_point=fetch_node
43+
... )
2844
"""
2945

3046
def __init__(self, nodes: list, edges: list, entry_point: str):
31-
"""
32-
Initializes the graph with nodes, edges, and the entry point.
33-
"""
3447

3548
self.nodes = nodes
3649
self.edges = self._create_edges({e for e in edges})
@@ -51,6 +64,7 @@ def _create_edges(self, edges: list) -> dict:
5164
Returns:
5265
dict: A dictionary of edges with the from-node as keys and to-node as values.
5366
"""
67+
5468
edge_dict = {}
5569
for from_node, to_node in edges:
5670
edge_dict[from_node.node_name] = to_node.node_name
@@ -66,8 +80,10 @@ def execute(self, initial_state: dict) -> Tuple[dict, list]:
6680
initial_state (dict): The initial state to pass to the entry point node.
6781
6882
Returns:
69-
dict: The state after execution has completed, which may have been altered by the nodes.
83+
Tuple[dict, list]: A tuple containing the final state of the execution and a list
84+
of execution information for each node.
7085
"""
86+
7187
current_node_name = self.nodes[0]
7288
state = initial_state
7389

scrapegraphai/graphs/json_scraper_graph.py

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
2-
Module for creating the smart scraper
2+
JSONScraperGraph Module
33
"""
4+
45
from .base_graph import BaseGraph
56
from ..nodes import (
67
FetchNode,
@@ -13,22 +14,44 @@
1314

1415
class JSONScraperGraph(AbstractGraph):
1516
"""
16-
SmartScraper is a comprehensive web scraping tool that automates the process of extracting
17-
information from web pages using a natural language model to interpret and answer prompts.
17+
JSONScraperGraph defines a scraping pipeline for JSON files.
18+
19+
Attributes:
20+
prompt (str): The prompt for the graph.
21+
source (str): The source of the graph.
22+
config (dict): Configuration parameters for the graph.
23+
llm_model: An instance of a language model client, configured for generating answers.
24+
embedder_model: An instance of an embedding model client, configured for generating embeddings.
25+
verbose (bool): A flag indicating whether to show print statements during execution.
26+
headless (bool): A flag indicating whether to run the graph in headless mode.
27+
28+
Args:
29+
prompt (str): The prompt for the graph.
30+
source (str): The source of the graph.
31+
config (dict): Configuration parameters for the graph.
32+
33+
Example:
34+
>>> json_scraper = JSONScraperGraph(
35+
... "List me all the attractions in Chioggia.",
36+
... "data/chioggia.json",
37+
... {"llm": {"model": "gpt-3.5-turbo"}}
38+
... )
39+
>>> result = json_scraper.run()
1840
"""
1941

2042
def __init__(self, prompt: str, source: str, config: dict):
21-
"""
22-
Initializes the JsonScraperGraph with a prompt, source, and configuration.
23-
"""
2443
super().__init__(prompt, config, source)
2544

2645
self.input_key = "json" if source.endswith("json") else "json_dir"
2746

28-
def _create_graph(self):
47+
def _create_graph(self) -> BaseGraph:
2948
"""
3049
Creates the graph of nodes representing the workflow for web scraping.
50+
51+
Returns:
52+
BaseGraph: A graph instance representing the web scraping workflow.
3153
"""
54+
3255
fetch_node = FetchNode(
3356
input="json_dir",
3457
output=["doc"],
@@ -81,7 +104,11 @@ def _create_graph(self):
81104
def run(self) -> str:
82105
"""
83106
Executes the web scraping process and returns the answer to the prompt.
107+
108+
Returns:
109+
str: The answer to the prompt.
84110
"""
111+
85112
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
86113
self.final_state, self.execution_info = self.graph.execute(inputs)
87114

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
2-
Module for creating the smart scraper
2+
ScriptCreatorGraph Module
33
"""
4+
45
from .base_graph import BaseGraph
56
from ..nodes import (
67
FetchNode,
@@ -13,24 +14,47 @@
1314

1415
class ScriptCreatorGraph(AbstractGraph):
1516
"""
16-
SmartScraper is a comprehensive web scraping tool that automates the process of extracting
17-
information from web pages using a natural language model to interpret and answer prompts.
17+
ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts.
18+
19+
Attributes:
20+
prompt (str): The prompt for the graph.
21+
source (str): The source of the graph.
22+
config (dict): Configuration parameters for the graph.
23+
llm_model: An instance of a language model client, configured for generating answers.
24+
embedder_model: An instance of an embedding model client, configured for generating embeddings.
25+
verbose (bool): A flag indicating whether to show print statements during execution.
26+
headless (bool): A flag indicating whether to run the graph in headless mode.
27+
model_token (int): The token limit for the language model.
28+
library (str): The library used for web scraping.
29+
30+
Args:
31+
prompt (str): The prompt for the graph.
32+
source (str): The source of the graph.
33+
config (dict): Configuration parameters for the graph.
34+
35+
Example:
36+
>>> script_creator = ScriptCreatorGraph(
37+
... "List me all the attractions in Chioggia.",
38+
... "https://en.wikipedia.org/wiki/Chioggia",
39+
... {"llm": {"model": "gpt-3.5-turbo"}}
40+
... )
41+
>>> result = script_creator.run()
1842
"""
1943

2044
def __init__(self, prompt: str, source: str, config: dict):
21-
"""
22-
Initializes the ScriptCreatorGraph with a prompt, source, and configuration.
23-
"""
24-
self.library = config['library']
25-
2645
super().__init__(prompt, config, source)
2746

2847
self.input_key = "url" if source.startswith("http") else "local_dir"
48+
self.library = config['library']
2949

30-
def _create_graph(self):
50+
def _create_graph(self) -> BaseGraph:
3151
"""
3252
Creates the graph of nodes representing the workflow for web scraping.
53+
54+
Returns:
55+
BaseGraph: A graph instance representing the web scraping workflow.
3356
"""
57+
3458
fetch_node = FetchNode(
3559
input="url | local_dir",
3660
output=["doc"],
@@ -76,7 +100,11 @@ def _create_graph(self):
76100
def run(self) -> str:
77101
"""
78102
Executes the web scraping process and returns the answer to the prompt.
103+
104+
Returns:
105+
str: The answer to the prompt.
79106
"""
107+
80108
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
81109
self.final_state, self.execution_info = self.graph.execute(inputs)
82110

scrapegraphai/graphs/search_graph.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
2-
Module for making the search on the internet
2+
SearchGraph Module
33
"""
4+
45
from .base_graph import BaseGraph
56
from ..nodes import (
67
SearchInternetNode,
@@ -14,13 +15,37 @@
1415

1516
class SearchGraph(AbstractGraph):
1617
"""
17-
Module for searching info on the internet
18+
SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt.
19+
It only requires a user prompt to search the internet and generate an answer.
20+
21+
Attributes:
22+
prompt (str): The user prompt to search the internet.
23+
llm_model (dict): The configuration for the language model.
24+
embedder_model (dict): The configuration for the embedder model.
25+
headless (bool): A flag to run the browser in headless mode.
26+
verbose (bool): A flag to display the execution information.
27+
model_token (int): The token limit for the language model.
28+
29+
Args:
30+
prompt (str): The user prompt to search the internet.
31+
config (dict): Configuration parameters for the graph.
32+
33+
Example:
34+
>>> search_graph = SearchGraph(
35+
... "What is Chioggia famous for?",
36+
... {"llm": {"model": "gpt-3.5-turbo"}}
37+
... )
38+
>>> result = search_graph.run()
1839
"""
1940

20-
def _create_graph(self):
41+
def _create_graph(self) -> BaseGraph:
2142
"""
2243
Creates the graph of nodes representing the workflow for web scraping and searching.
44+
45+
Returns:
46+
BaseGraph: A graph instance representing the web scraping and searching workflow.
2347
"""
48+
2449
search_internet_node = SearchInternetNode(
2550
input="user_prompt",
2651
output=["url"],
@@ -83,7 +108,11 @@ def _create_graph(self):
83108
def run(self) -> str:
84109
"""
85110
Executes the web scraping and searching process.
111+
112+
Returns:
113+
str: The answer to the prompt.
86114
"""
115+
87116
inputs = {"user_prompt": self.prompt}
88117
self.final_state, self.execution_info = self.graph.execute(inputs)
89118

0 commit comments

Comments
 (0)