Skip to content

Commit c251cc4

Browse files
committed
fix(node-logging): use centralized logger in each node for logging
1 parent 4348d4f commit c251cc4

18 files changed

+406
-242
lines changed

scrapegraphai/nodes/blocks_identifier.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,33 +3,44 @@
33
"""
44

55
from typing import List, Optional
6+
67
from langchain_community.document_loaders import AsyncChromiumLoader
78
from langchain_core.documents import Document
8-
from .base_node import BaseNode
99

10+
from .base_node import BaseNode
1011

1112

1213
class BlocksIndentifier(BaseNode):
1314
"""
1415
A node responsible for identifying the blocks in the HTML content of a specified web page,
15-
e.g products in a E-commerce, flights in a travel website etc.
16+
e.g. products on an e-commerce site, flights on a travel website, etc.
1617
1718
Attributes:
1819
headless (bool): A flag indicating whether the browser should run in headless mode.
1920
verbose (bool): A flag indicating whether to print verbose output during execution.
20-
21+
2122
Args:
2223
input (str): Boolean expression defining the input keys needed from the state.
2324
output (List[str]): List of output keys to be updated in the state.
2425
node_config (Optional[dict]): Additional configuration for the node.
2526
node_name (str): The unique identifier name for the node, defaulting to "BlocksIndentifier".
2627
"""
2728

28-
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "BlocksIndentifier"):
29+
def __init__(
30+
self,
31+
input: str,
32+
output: List[str],
33+
node_config: Optional[dict],
34+
node_name: str = "BlocksIndentifier",
35+
):
2936
super().__init__(node_name, "node", input, output, 1)
3037

31-
self.headless = True if node_config is None else node_config.get("headless", True)
32-
self.verbose = True if node_config is None else node_config.get("verbose", False)
38+
self.headless = (
39+
True if node_config is None else node_config.get("headless", True)
40+
)
41+
self.verbose = (
42+
True if node_config is None else node_config.get("verbose", False)
43+
)
3344

3445
def execute(self, state):
3546
"""
@@ -47,8 +58,7 @@ def execute(self, state):
4758
KeyError: If the input key is not found in the state, indicating that the
4859
necessary information to perform the operation is missing.
4960
"""
50-
if self.verbose:
51-
print(f"--- Executing {self.node_name} Node ---")
61+
self.logger.info(f"--- Executing {self.node_name} Node ---")
5262

5363
# Interpret input keys based on the provided input expression
5464
input_keys = self.get_input_keys(state)

scrapegraphai/nodes/fetch_node.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,18 @@
33
"""
44

55
import json
6-
import requests
76
from typing import List, Optional
87

98
import pandas as pd
9+
import requests
1010
from langchain_community.document_loaders import PyPDFLoader
1111
from langchain_core.documents import Document
1212

1313
from ..docloaders import ChromiumLoader
14-
from .base_node import BaseNode
1514
from ..utils.cleanup_html import cleanup_html
1615
from ..utils.logging import get_logger
16+
from .base_node import BaseNode
17+
1718

1819
class FetchNode(BaseNode):
1920
"""
@@ -51,7 +52,7 @@ def __init__(
5152
False if node_config is None else node_config.get("verbose", False)
5253
)
5354
self.useSoup = (
54-
False if node_config is None else node_config.get("useSoup", False)
55+
False if node_config is None else node_config.get("useSoup", False)
5556
)
5657
self.loader_kwargs = (
5758
{} if node_config is None else node_config.get("loader_kwargs", {})
@@ -73,8 +74,8 @@ def execute(self, state):
7374
KeyError: If the input key is not found in the state, indicating that the
7475
necessary information to perform the operation is missing.
7576
"""
76-
77-
logger.info(f"--- Executing {self.node_name} Node ---")
77+
78+
self.logger.info(f"--- Executing {self.node_name} Node ---")
7879

7980
# Interpret input keys based on the provided input expression
8081
input_keys = self.get_input_keys(state)
@@ -92,7 +93,7 @@ def execute(self, state):
9293
]
9394
state.update({self.output[0]: compressed_document})
9495
return state
95-
96+
9697
# handling for pdf
9798
elif input_keys[0] == "pdf":
9899
loader = PyPDFLoader(source)
@@ -108,15 +109,15 @@ def execute(self, state):
108109
]
109110
state.update({self.output[0]: compressed_document})
110111
return state
111-
112+
112113
elif input_keys[0] == "json":
113114
f = open(source)
114115
compressed_document = [
115116
Document(page_content=str(json.load(f)), metadata={"source": "json"})
116117
]
117118
state.update({self.output[0]: compressed_document})
118119
return state
119-
120+
120121
elif input_keys[0] == "xml":
121122
with open(source, "r", encoding="utf-8") as f:
122123
data = f.read()
@@ -125,25 +126,29 @@ def execute(self, state):
125126
]
126127
state.update({self.output[0]: compressed_document})
127128
return state
128-
129+
129130
elif self.input == "pdf_dir":
130131
pass
131132

132133
elif not source.startswith("http"):
133134
title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
134135
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
135-
compressed_document = [Document(page_content=parsed_content,
136-
metadata={"source": "local_dir"}
137-
)]
138-
136+
compressed_document = [
137+
Document(page_content=parsed_content, metadata={"source": "local_dir"})
138+
]
139+
139140
elif self.useSoup:
140141
response = requests.get(source)
141142
if response.status_code == 200:
142-
title, minimized_body, link_urls, image_urls = cleanup_html(response.text, source)
143+
title, minimized_body, link_urls, image_urls = cleanup_html(
144+
response.text, source
145+
)
143146
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
144147
compressed_document = [Document(page_content=parsed_content)]
145-
else:
146-
self.logger.warning(f"Failed to retrieve contents from the webpage at url: {source}")
148+
else:
149+
self.logger.warning(
150+
f"Failed to retrieve contents from the webpage at url: {source}"
151+
)
147152

148153
else:
149154
loader_kwargs = {}
@@ -153,14 +158,22 @@ def execute(self, state):
153158

154159
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
155160
document = loader.load()
156-
157-
title, minimized_body, link_urls, image_urls = cleanup_html(str(document[0].page_content), source)
161+
162+
title, minimized_body, link_urls, image_urls = cleanup_html(
163+
str(document[0].page_content), source
164+
)
158165
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
159-
166+
160167
compressed_document = [
161168
Document(page_content=parsed_content, metadata={"source": source})
162169
]
163170

164-
state.update({self.output[0]: compressed_document, self.output[1]: link_urls, self.output[2]: image_urls})
171+
state.update(
172+
{
173+
self.output[0]: compressed_document,
174+
self.output[1]: link_urls,
175+
self.output[2]: image_urls,
176+
}
177+
)
165178

166179
return state

scrapegraphai/nodes/generate_answer_csv_node.py

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,16 @@
22
gg
33
Module for generating the answer node
44
"""
5+
56
# Imports from standard library
67
from typing import List, Optional
7-
from tqdm import tqdm
88

99
# Imports from Langchain
1010
from langchain.prompts import PromptTemplate
1111
from langchain_core.output_parsers import JsonOutputParser
1212
from langchain_core.runnables import RunnableParallel
13+
from tqdm import tqdm
14+
1315
from ..utils.logging import get_logger
1416

1517
# Imports from the library
@@ -25,24 +27,29 @@ class GenerateAnswerCSVNode(BaseNode):
2527
2628
Attributes:
2729
llm_model: An instance of a language model client, configured for generating answers.
28-
node_name (str): The unique identifier name for the node, defaulting
30+
node_name (str): The unique identifier name for the node, defaulting
2931
to "GenerateAnswerNodeCsv".
30-
node_type (str): The type of the node, set to "node" indicating a
32+
node_type (str): The type of the node, set to "node" indicating a
3133
standard operational node.
3234
3335
Args:
34-
llm_model: An instance of the language model client (e.g., ChatOpenAI) used
36+
llm_model: An instance of the language model client (e.g., ChatOpenAI) used
3537
for generating answers.
36-
node_name (str, optional): The unique identifier name for the node.
38+
node_name (str, optional): The unique identifier name for the node.
3739
Defaults to "GenerateAnswerNodeCsv".
3840
3941
Methods:
4042
execute(state): Processes the input and document from the state to generate an answer,
4143
updating the state with the generated answer under the 'answer' key.
4244
"""
4345

44-
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None,
45-
node_name: str = "GenerateAnswer"):
46+
def __init__(
47+
self,
48+
input: str,
49+
output: List[str],
50+
node_config: Optional[dict] = None,
51+
node_name: str = "GenerateAnswer",
52+
):
4653
"""
4754
Initializes the GenerateAnswerNodeCsv with a language model client and a node name.
4855
Args:
@@ -51,8 +58,9 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict] =
5158
"""
5259
super().__init__(node_name, "node", input, output, 2, node_config)
5360
self.llm_model = node_config["llm_model"]
54-
self.verbose = False if node_config is None else node_config.get(
55-
"verbose", False)
61+
self.verbose = (
62+
False if node_config is None else node_config.get("verbose", False)
63+
)
5664

5765
def execute(self, state):
5866
"""
@@ -73,8 +81,7 @@ def execute(self, state):
7381
that the necessary information for generating an answer is missing.
7482
"""
7583

76-
if self.verbose:
77-
self.logger.info(f"--- Executing {self.node_name} Node ---")
84+
self.logger.info(f"--- Executing {self.node_name} Node ---")
7885

7986
# Interpret input keys based on the provided input expression
8087
input_keys = self.get_input_keys(state)
@@ -122,21 +129,27 @@ def execute(self, state):
122129
chains_dict = {}
123130

124131
# Use tqdm to add progress bar
125-
for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)):
132+
for i, chunk in enumerate(
133+
tqdm(doc, desc="Processing chunks", disable=not self.verbose)
134+
):
126135
if len(doc) == 1:
127136
prompt = PromptTemplate(
128137
template=template_no_chunks,
129138
input_variables=["question"],
130-
partial_variables={"context": chunk.page_content,
131-
"format_instructions": format_instructions},
139+
partial_variables={
140+
"context": chunk.page_content,
141+
"format_instructions": format_instructions,
142+
},
132143
)
133144
else:
134145
prompt = PromptTemplate(
135146
template=template_chunks,
136147
input_variables=["question"],
137-
partial_variables={"context": chunk.page_content,
138-
"chunk_id": i + 1,
139-
"format_instructions": format_instructions},
148+
partial_variables={
149+
"context": chunk.page_content,
150+
"chunk_id": i + 1,
151+
"format_instructions": format_instructions,
152+
},
140153
)
141154

142155
# Dynamically name the chains based on their index
@@ -155,8 +168,7 @@ def execute(self, state):
155168
partial_variables={"format_instructions": format_instructions},
156169
)
157170
merge_chain = merge_prompt | self.llm_model | output_parser
158-
answer = merge_chain.invoke(
159-
{"context": answer, "question": user_prompt})
171+
answer = merge_chain.invoke({"context": answer, "question": user_prompt})
160172
else:
161173
# Chain
162174
single_chain = list(chains_dict.values())[0]

0 commit comments

Comments
 (0)