fix(md_conversion): add absolute links md, added missing dependency

PeriniM · web-flow · commit 12b5eada6ea7 · 2024-07-23T15:34:12.000+02:00
diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py
@@ -27,8 +27,8 @@
 # ************************************************
 
 smart_scraper_graph = SmartScraperGraph(
-    prompt="Extract me the python code inside the page",
-    source="https://www.exploit-db.com/exploits/51447",
+    prompt="List me what does the company do, the name and a contact email.",
+    source="https://scrapegraphai.com/",
     config=graph_config
 )
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,6 +14,8 @@ authors = [
 ]
 dependencies = [
     "langchain>=0.2.10",
+    "langchain-fireworks>=0.1.3",
+    "langchain_community>=0.2.9",
     "langchain-google-genai>=1.0.7",
     "langchain-google-vertexai",
     "langchain-openai>=0.1.17",
@@ -36,7 +38,6 @@ dependencies = [
     "undetected-playwright>=0.3.0",
     "semchunk>=1.0.1",
     "html2text>=2024.2.26",
-    "langchain-fireworks>=0.1.3",
 ]
 
 license = "MIT"
diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py
@@ -220,6 +220,8 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
         # Log the graph execution telemetry
         graph_execution_time = time.time() - start_time
         response = state.get("answer", None) if source_type == "url" else None
+        content = state.get("parsed_doc", None) if response is not None else None
+        
         log_graph_execution(
             graph_name=self.graph_name,
             source=source,
@@ -228,6 +230,7 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
             llm_model=llm_model,
             embedder_model=embedder_model,
             source_type=source_type,
+            content=content,
             response=response,
             execution_time=graph_execution_time,
             total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
@@ -185,7 +185,7 @@ def execute(self, state):
                     parsed_content = cleanup_html(response, source)
 
                 if  (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
-                    parsed_content = convert_to_md(source)
+                    parsed_content = convert_to_md(source, input_data[0])
                 compressed_document = [Document(page_content=parsed_content)]
             else:
                 self.logger.warning(
@@ -207,7 +207,8 @@ def execute(self, state):
             parsed_content = document[0].page_content
 
             if  isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
-                parsed_content = convert_to_md(document[0].page_content)
+
+                parsed_content = convert_to_md(document[0].page_content, input_data[0])
 
 
             compressed_document = [
diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py
@@ -156,7 +156,7 @@ def log_event(event: str, properties: Dict[str, any]):
         send_event_json(event_json)
 
 
-def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
+def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, content: str = None, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
     properties = {
         "graph_name": graph_name,
         "source": source,
@@ -165,11 +165,13 @@ def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, l
         "llm_model": llm_model,
         "embedder_model": embedder_model,
         "source_type": source_type,
+        "content": content,
         "response": response,
         "execution_time": execution_time,
         "error_node": error_node,
         "exception": exception,
         "total_tokens": total_tokens,
+        "type": "community-library"
     }
     log_event("graph_execution", properties)
 
diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py
@@ -2,8 +2,9 @@
 convert_to_md modul
 """
 import html2text
+from urllib.parse import urlparse
 
-def convert_to_md(html):
+def convert_to_md(html: str, url: str = None) -> str:
     """ Convert HTML to Markdown.
     This function uses the html2text library to convert the provided HTML content to Markdown 
     format.
@@ -18,6 +19,12 @@ def convert_to_md(html):
     'This is a paragraph.\n\n# This is a heading.'
 
     Note: All the styles and links are ignored during the conversion. """
+
+    # Only use `url` as link base when it is absolute; it may be None or a local path.
+    parsed_url = urlparse(url) if url else None
     h = html2text.HTML2Text()
     h.ignore_links = False
+    if parsed_url and parsed_url.scheme and parsed_url.netloc:
+        h.baseurl = f"{parsed_url.scheme}://{parsed_url.netloc}"
+    h.body_width = 0  # 0 disables html2text line wrapping
     return h.handle(html)

Original file line number	Diff line number	Diff line change
`@@ -27,8 +27,8 @@`
`27`	`27`	`# ************************************************`
`28`	`28`
`29`	`29`	`smart_scraper_graph = SmartScraperGraph(`
`30`		`- prompt="Extract me the python code inside the page",`
`31`		`- source="https://www.exploit-db.com/exploits/51447",`
	`30`	`+ prompt="List me what does the company do, the name and a contact email.",`
	`31`	`+ source="https://scrapegraphai.com/",`
`32`	`32`	`config=graph_config`
`33`	`33`	`)`
`34`	`34`