Skip to content

Commit 12b5ead

Browse files
authored
fix(md_conversion): add absolute links to the Markdown output, add missing dependency
1 parent 1756e85 commit 12b5ead

File tree

6 files changed

+21
-7
lines changed

6 files changed

+21
-7
lines changed

examples/openai/smart_scraper_openai.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
# ************************************************
2828

2929
smart_scraper_graph = SmartScraperGraph(
30-
prompt="Extract me the python code inside the page",
31-
source="https://www.exploit-db.com/exploits/51447",
30+
prompt="List me what does the company do, the name and a contact email.",
31+
source="https://scrapegraphai.com/",
3232
config=graph_config
3333
)
3434

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ authors = [
1414
]
1515
dependencies = [
1616
"langchain>=0.2.10",
17+
"langchain-fireworks>=0.1.3",
18+
"langchain_community>=0.2.9",
1719
"langchain-google-genai>=1.0.7",
1820
"langchain-google-vertexai",
1921
"langchain-openai>=0.1.17",
@@ -36,7 +38,6 @@ dependencies = [
3638
"undetected-playwright>=0.3.0",
3739
"semchunk>=1.0.1",
3840
"html2text>=2024.2.26",
39-
"langchain-fireworks>=0.1.3",
4041
]
4142

4243
license = "MIT"

scrapegraphai/graphs/base_graph.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,8 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
220220
# Log the graph execution telemetry
221221
graph_execution_time = time.time() - start_time
222222
response = state.get("answer", None) if source_type == "url" else None
223+
content = state.get("parsed_doc", None) if response is not None else None
224+
223225
log_graph_execution(
224226
graph_name=self.graph_name,
225227
source=source,
@@ -228,6 +230,7 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
228230
llm_model=llm_model,
229231
embedder_model=embedder_model,
230232
source_type=source_type,
233+
content=content,
231234
response=response,
232235
execution_time=graph_execution_time,
233236
total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,

scrapegraphai/nodes/fetch_node.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def execute(self, state):
185185
parsed_content = cleanup_html(response, source)
186186

187187
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
188-
parsed_content = convert_to_md(source)
188+
parsed_content = convert_to_md(source, input_data[0])
189189
compressed_document = [Document(page_content=parsed_content)]
190190
else:
191191
self.logger.warning(
@@ -207,7 +207,8 @@ def execute(self, state):
207207
parsed_content = document[0].page_content
208208

209209
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
210-
parsed_content = convert_to_md(document[0].page_content)
210+
211+
parsed_content = convert_to_md(document[0].page_content, input_data[0])
211212

212213

213214
compressed_document = [

scrapegraphai/telemetry/telemetry.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def log_event(event: str, properties: Dict[str, any]):
156156
send_event_json(event_json)
157157

158158

159-
def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
159+
def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, content: str = None, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
160160
properties = {
161161
"graph_name": graph_name,
162162
"source": source,
@@ -165,11 +165,13 @@ def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, l
165165
"llm_model": llm_model,
166166
"embedder_model": embedder_model,
167167
"source_type": source_type,
168+
"content": content,
168169
"response": response,
169170
"execution_time": execution_time,
170171
"error_node": error_node,
171172
"exception": exception,
172173
"total_tokens": total_tokens,
174+
"type": "community-library"
173175
}
174176
log_event("graph_execution", properties)
175177

scrapegraphai/utils/convert_to_md.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
convert_to_md module
33
"""
44
import html2text
5+
from urllib.parse import urlparse
56

6-
def convert_to_md(html):
7+
def convert_to_md(html: str, url: str = None) -> str:
78
""" Convert HTML to Markdown.
89
This function uses the html2text library to convert the provided HTML content to Markdown
910
format.
@@ -18,6 +19,12 @@ def convert_to_md(html):
1819
'This is a paragraph.\n\n# This is a heading.'
1920
2021
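Note: Styles are ignored during the conversion; links are kept (ignore_links is False) and, when a URL is given, its scheme and host are used as the base URL for them. """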
22+
23+
if url:
24+
parsed_url = urlparse(url)
25+
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
2126
h = html2text.HTML2Text()
2227
h.ignore_links = False
28+
h.baseurl = domain
29+
h.body_width = 0
2330
return h.handle(html)

0 commit comments

Comments
 (0)