
Commit 02ec4c1

Merge pull request #704 from ScrapeGraphAI/refactoring-smart_scraper
feat: add html_mode to smart_scraper
2 parents e5ac020 + 1e4ee3a commit 02ec4c1

File tree: 2 files changed (+86, -20 lines)

examples/extras/html_mode.py

Lines changed: 49 additions & 0 deletions
"""
Basic example of a scraping pipeline using SmartScraper.

By default SmartScraper converts the fetched page to Markdown before
extraction. If you want to work on the original HTML instead, you have
to set html_mode in the config.
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "html_mode": True,
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
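For comparison, here is a minimal sketch of the default behaviour (Markdown conversion), assuming the same environment and config keys as the example above; html_mode is simply left unset:

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()

# Sketch only: same example as above with "html_mode" omitted, so the
# default pipeline converts the fetched page to Markdown and parses it
# into chunks before the LLM extracts the answer.
markdown_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    # "html_mode" not set -> Markdown conversion + ParseNode
    "verbose": True,
    "headless": False,
}

markdown_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=markdown_config,
)

print(json.dumps(markdown_graph.run(), indent=4))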

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 37 additions & 20 deletions
@@ -69,14 +69,7 @@ def _create_graph(self) -> BaseGraph:
                 "scrape_do": self.config.get("scrape_do")
             }
         )
-        parse_node = ParseNode(
-            input="doc",
-            output=["parsed_doc"],
-            node_config={
-                "llm_model": self.llm_model,
-                "chunk_size": self.model_token
-            }
-        )
+
 
         generate_answer_node = GenerateAnswerNode(
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
@@ -88,19 +81,43 @@ def _create_graph(self) -> BaseGraph:
             }
         )
 
+        if self.config.get("html_mode") is not True:
+
+            parse_node = ParseNode(
+                input="doc",
+                output=["parsed_doc"],
+                node_config={
+                    "llm_model": self.llm_model,
+                    "chunk_size": self.model_token
+                }
+            )
+
+            return BaseGraph(
+                nodes=[
+                    fetch_node,
+                    parse_node,
+                    generate_answer_node,
+                ],
+                edges=[
+                    (fetch_node, parse_node),
+                    (parse_node, generate_answer_node)
+                ],
+                entry_point=fetch_node,
+                graph_name=self.__class__.__name__
+            )
+
         return BaseGraph(
-            nodes=[
-                fetch_node,
-                parse_node,
-                generate_answer_node,
-            ],
-            edges=[
-                (fetch_node, parse_node),
-                (parse_node, generate_answer_node)
-            ],
-            entry_point=fetch_node,
-            graph_name=self.__class__.__name__
-        )
+            nodes=[
+                fetch_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, generate_answer_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
 
     def run(self) -> str:
         """
