Skip to content

Commit 283b61f

Browse files
committed
docs: better logging
1 parent a6757ac commit 283b61f

File tree

5 files changed

+8
-14
lines changed

5 files changed

+8
-14
lines changed

examples/openai/smart_scraper_openai.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@
2121
"api_key": openai_key,
2222
"model": "gpt-3.5-turbo",
2323
},
24-
"verbose": False,
24+
"verbose": True,
2525
"headless": False,
2626
}
2727

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -66,10 +66,9 @@ def _create_graph(self) -> BaseGraph:
6666
output=["doc", "link_urls", "img_urls"],
6767
node_config={
6868
"loader_kwargs": self.config.get("loader_kwargs", {}),
69-
"headless": self.config.get("headless", True) # Ensure headless flag is passed
7069
}
7170
)
72-
logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
71+
7372
parse_node = ParseNode(
7473
input="doc",
7574
output=["parsed_doc"],

scrapegraphai/graphs/smart_scraper_multi_graph.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,8 @@ def __init__(self, prompt: str, source: List[str], config: dict, schema: Optiona
5151
self.copy_config = copy(config)
5252
else:
5353
self.copy_config = deepcopy(config)
54+
55+
self.copy_schema = deepcopy(schema)
5456

5557
super().__init__(prompt, config, source, schema)
5658

@@ -70,6 +72,7 @@ def _create_graph(self) -> BaseGraph:
7072
prompt="",
7173
source="",
7274
config=self.copy_config,
75+
schema=self.copy_schema
7376
)
7477

7578
# ************************************************

scrapegraphai/nodes/fetch_node.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ def execute(self, state):
131131
pass
132132

133133
elif not source.startswith("http"):
134-
self.logger.info(f"Fetching local HTML content from: {source}")
134+
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
135135
if not source.strip():
136136
raise ValueError("No HTML body content found in the local source.")
137137
title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
@@ -141,7 +141,7 @@ def execute(self, state):
141141
]
142142

143143
elif self.useSoup:
144-
self.logger.info(f"Fetching HTML content using requests from: {source}")
144+
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
145145
response = requests.get(source)
146146
if response.status_code == 200:
147147
if not response.text.strip():
@@ -157,7 +157,7 @@ def execute(self, state):
157157
)
158158

159159
else:
160-
self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}")
160+
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
161161
loader_kwargs = {}
162162

163163
if self.node_config is not None:

scrapegraphai/utils/cleanup_html.py

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
from minify_html import minify
66
from urllib.parse import urljoin
77

8-
98
def cleanup_html(html_content: str, base_url: str) -> str:
109
"""
1110
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
@@ -24,12 +23,6 @@ def cleanup_html(html_content: str, base_url: str) -> str:
2423
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
2524
"""
2625

27-
import logging
28-
logging.basicConfig(level=logging.DEBUG)
29-
30-
# Add logging to capture the HTML content before parsing
31-
logging.debug(f'HTML content before parsing: {html_content}')
32-
3326
soup = BeautifulSoup(html_content, 'html.parser')
3427

3528
# Title Extraction
@@ -62,6 +55,5 @@ def cleanup_html(html_content: str, base_url: str) -> str:
6255
return title, minimized_body, link_urls, image_urls
6356

6457
else:
65-
logging.error(f'No body content found in HTML: {html_content}')
6658
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")
6759

0 commit comments

Comments
 (0)