Skip to content

Commit cf038b3

Browse files
committed
docs: update utils docstrings
1 parent 96975b2 commit cf038b3

File tree

9 files changed

+126
-58
lines changed

9 files changed

+126
-58
lines changed

scrapegraphai/utils/convert_to_csv.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,27 @@
66
import pandas as pd
77

88

9-
def convert_to_csv(data: dict, filename: str, position: str = None):
9+
def convert_to_csv(data: dict, filename: str, position: str = None) -> None:
1010
"""
11-
Converts a dictionary to a CSV file and saves it.
11+
Converts a dictionary to a CSV file and saves it at a specified location.
1212
1313
Args:
14-
data (dict): Data to be converted to CSV.
15-
position (str): Optional path where the file should be saved. If not provided,
16-
the directory of the caller script will be used.
14+
data (dict): The data to be converted into CSV format.
15+
filename (str): The name of the output CSV file, without the '.csv' extension.
16+
position (str, optional): The file path where the CSV should be saved. Defaults to the directory of the caller script if not provided.
1717
18+
Returns:
19+
None: The function does not return anything.
20+
1821
Raises:
19-
FileNotFoundError: If the specified directory does not exist.
20-
PermissionError: If the program lacks write permission for the directory.
21-
TypeError: If the input data is not a dictionary.
22-
Exception: For other potential errors during DataFrame creation or CSV saving.
22+
FileNotFoundError: If the specified directory does not exist.
23+
PermissionError: If write permissions are lacking for the directory.
24+
TypeError: If `data` is not a dictionary.
25+
Exception: For other issues that may arise during the creation or saving of the CSV file.
26+
27+
Example:
28+
>>> convert_to_csv({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
29+
Saves a CSV file named 'output.csv' at '/path/to/save'.
2330
"""
2431

2532
if ".csv" in filename:

scrapegraphai/utils/convert_to_json.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,33 @@
66
import sys
77

88

9-
def convert_to_json(data: dict, filename: str, position: str = None):
9+
def convert_to_json(data: dict, filename: str, position: str = None) -> None:
1010
"""
11-
Convert data to JSON format and save it to a file.
11+
Converts a dictionary to a JSON file and saves it at a specified location.
1212
1313
Args:
14-
data (dict): Data to save.
15-
filename (str): Name of the file to save without .json extension.
16-
position (str): Directory where the file should be saved. If None,
17-
the directory of the caller script will be used.
14+
data (dict): The data to be converted into JSON format.
15+
filename (str): The name of the output JSON file, without the '.json' extension.
16+
position (str, optional): The file path where the JSON file should be saved. Defaults to the directory of the caller script if not provided.
1817
18+
Returns:
19+
None: The function does not return anything.
20+
1921
Raises:
20-
ValueError: If filename contains '.json'.
21-
FileNotFoundError: If the specified directory does not exist.
22-
PermissionError: If the program does not have permission to write to the directory.
22+
ValueError: If 'filename' contains '.json'.
23+
FileNotFoundError: If the specified directory does not exist.
24+
PermissionError: If write permissions are lacking for the directory.
25+
26+
Example:
27+
>>> convert_to_json({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
28+
Saves a JSON file named 'output.json' at '/path/to/save'.
29+
30+
Notes:
31+
This function automatically ensures the directory exists before attempting to write the file. If the directory does not exist, it will attempt to create it.
2332
"""
33+
2434
if ".json" in filename:
25-
filename = filename.replace(".json", "") # Remove .csv extension
35+
filename = filename.replace(".json", "") # Remove .json extension
2636

2737
# Get the directory of the caller script
2838
if position is None:

scrapegraphai/utils/parse_state_keys.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,30 @@
44
import re
55

66

7-
def parse_expression(expression, state: dict):
8-
"""
9-
Function for parsing the expressions
7+
def parse_expression(expression, state: dict) -> list:
8+
"""
9+
Parses a complex boolean expression involving state keys.
10+
1011
Args:
11-
state (dict): state to elaborate
12+
expression (str): The boolean expression to parse.
13+
state (dict): Dictionary of state keys used to evaluate the expression.
14+
15+
Raises:
16+
ValueError: If the expression is empty, has adjacent state keys without operators, invalid operator usage,
17+
unbalanced parentheses, or if no state keys match the expression.
18+
19+
Returns:
20+
list: A list of state keys that match the boolean expression, ensuring each key appears only once.
21+
22+
Example:
23+
>>> parse_expression("user_input & (relevant_chunks | parsed_document | document)",
24+
{"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None})
25+
['user_input', 'relevant_chunks', 'parsed_document', 'document']
26+
27+
This function evaluates the expression to determine the logical inclusion of state keys based on provided boolean logic.
28+
It checks for syntax errors such as unbalanced parentheses, incorrect adjacency of operators, and empty expressions.
1229
"""
30+
1331
# Check for empty expression
1432
if not expression:
1533
raise ValueError("Empty expression.")

scrapegraphai/utils/prettify_exec_info.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,17 @@
77

88
def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame:
99
"""
10-
Transform the execution information of the graph into a DataFrame for better visualization.
10+
Transforms the execution information of a graph into a DataFrame for enhanced visualization.
1111
1212
Args:
13-
- complete_result (list[dict]): The complete execution information of the graph.
13+
complete_result (list[dict]): The complete execution information of the graph.
1414
1515
Returns:
16-
- pd.DataFrame: The execution information of the graph in a DataFrame.
16+
pd.DataFrame: A DataFrame that organizes the execution information for better readability and analysis.
17+
18+
Example:
19+
>>> prettify_exec_info([{'node': 'A', 'status': 'success'}, {'node': 'B', 'status': 'failure'}])
20+
DataFrame with columns 'node' and 'status' showing execution results for each node.
1721
"""
1822

1923
df_nodes = pd.DataFrame(complete_result)

scrapegraphai/utils/proxy_rotation.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,29 @@
44
from fp.fp import FreeProxy
55

66

7-
def proxy_generator(num_ips: int):
7+
def proxy_generator(num_ips: int) -> list:
88
"""
9-
Rotates through a specified number of proxy IPs using the FreeProxy library.
9+
Generates a specified number of proxy IP addresses using the FreeProxy library.
1010
1111
Args:
12-
num_ips (int): The number of proxy IPs to rotate through.
12+
num_ips (int): The number of proxy IPs to generate and rotate through.
1313
1414
Returns:
15-
dict: A dictionary containing the rotated proxy IPs, indexed by their position in rotation.
15+
list: A list of proxy IP addresses.
1616
1717
Example:
1818
>>> proxy_generator(5)
19-
{
20-
0: '192.168.1.1:8080',
21-
1: '103.10.63.135:8080',
22-
2: '176.9.75.42:8080',
23-
3: '37.57.216.2:8080',
24-
4: '113.20.31.250:8080'
25-
}
19+
[
20+
'192.168.1.1:8080',
21+
'103.10.63.135:8080',
22+
'176.9.75.42:8080',
23+
'37.57.216.2:8080',
24+
'113.20.31.250:8080'
25+
]
26+
27+
This function fetches fresh proxies and indexes them, making it easy to manage multiple proxy configurations.
2628
"""
29+
2730
res = []
2831

2932
for i in range(0, num_ips):

scrapegraphai/utils/remover.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,20 @@
77

88
def remover(html_content: str) -> str:
99
"""
10-
This function processes HTML content, removes unnecessary tags
11-
(including style tags), minifies the HTML, and retrieves the
12-
title and body content.
10+
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
1311
14-
Parameters:
15-
html_content (str): The HTML content to parse
12+
Args:
13+
html_content (str): The HTML content to be processed.
1614
1715
Returns:
18-
str: The parsed title followed by the minified body content
16+
str: A string combining the parsed title and the minified body content. If no body content is found, it indicates so.
17+
18+
Example:
19+
>>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
20+
>>> remover(html_content)
21+
'Title: Example, Body: <body><p>Hello World!</p></body>'
22+
23+
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
1924
"""
2025

2126
soup = BeautifulSoup(html_content, 'html.parser')

scrapegraphai/utils/research_web.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,25 @@
88

99

1010
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
11-
"""
12-
Function that given a query it finds it on the intenet
11+
"""
12+
Searches the web for a given query using specified search engine options.
13+
1314
Args:
14-
query (str): query to search on internet
15-
search_engine (str, optional): type of browser, it could be DuckDuckGo or Google,
16-
default: Google
17-
max_results (int, optional): maximum number of results
15+
query (str): The search query to find on the internet.
16+
search_engine (str, optional): Specifies the search engine to use, options include 'Google' or 'DuckDuckGo'. Default is 'Google'.
17+
max_results (int, optional): The maximum number of search results to return.
1818
1919
Returns:
20-
List[str]: List of strings of web link
20+
List[str]: A list of URLs as strings that are the search results.
21+
22+
Raises:
23+
ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'.
24+
25+
Example:
26+
>>> search_on_web("example query", search_engine="Google", max_results=5)
27+
['http://example.com', 'http://example.org', ...]
28+
29+
This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs.
2130
"""
2231

2332
if search_engine == "Google":

scrapegraphai/utils/save_audio_from_bytes.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,18 @@
77

88
def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -> None:
99
"""
10-
Saves the byte response as an audio file.
10+
Saves the byte response as an audio file to the specified path.
1111
1212
Args:
13-
byte_response (bytes): The byte response containing the generated speech.
14-
output_path (str or Path): The file path where the generated speech should be saved.
13+
byte_response (bytes): The byte array containing audio data.
14+
output_path (Union[str, Path]): The destination file path where the audio file will be saved.
15+
16+
Example:
17+
>>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3')
18+
19+
This function writes the byte array containing audio data to a file, saving it as an audio file.
1520
"""
21+
1622
if not isinstance(output_path, Path):
1723
output_path = Path(output_path)
1824

scrapegraphai/utils/token_calculator.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,21 @@
88

99
def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
1010
"""
11-
It creates a list of strings to create max dimension tokenizable elements
11+
Truncates text into chunks that are small enough to be processed by specified llm models.
1212
1313
Args:
14-
text (str): The input text to be truncated into tokenizable elements.
15-
model (str): The name of the language model to be used.
16-
encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
14+
text (str): The input text to be truncated.
15+
model (str): The name of the llm model to determine the maximum token limit.
16+
encoding_name (str): The encoding strategy used to encode the text before truncation.
1717
1818
Returns:
19-
List[str]: A list of tokenizable elements created from the input text.
19+
List[str]: A list of text chunks, each within the token limit of the specified model.
20+
21+
Example:
22+
>>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING")
23+
["This is a sample text", "for truncation."]
24+
25+
This function ensures that each chunk of text can be tokenized by the specified model without exceeding the model's token limit.
2026
"""
2127

2228
encoding = tiktoken.get_encoding(encoding_name)

0 commit comments

Comments (0)