fix: use "csv"/"xml" input keys in the scraper graphs and load those sources in FetchNode

VinciGit00 · VinciGit00 · commit 324e977b853e · 2024-05-09T20:46:46.000+02:00
diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py
@@ -30,7 +30,7 @@ def _create_graph(self):
         Creates the graph of nodes representing the workflow for web scraping.
         """
         fetch_node = FetchNode(
-            input="csv_dir",
+            input="csv",
             output=["doc"],
         )
         parse_node = ParseNode(
@@ -78,4 +78,4 @@ def run(self) -> str:
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
-        return self.final_state.get("answer", "No answer found.")
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py
@@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph:
         """
 
         fetch_node = FetchNode(
-            input="xml_dir",
+            input="xml",
             output=["doc"]
         )
         parse_node = ParseNode(
@@ -108,4 +108,4 @@ def run(self) -> str:
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
-        return self.final_state.get("answer", "No answer found.")
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
@@ -1,7 +1,7 @@
 """ 
 FetchNode Module
 """
-
+import pandas as pd
 from typing import List, Optional
 from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.documents import Document
@@ -22,19 +22,21 @@ class FetchNode(BaseNode):
     Attributes:
         headless (bool): A flag indicating whether the browser should run in headless mode.
         verbose (bool): A flag indicating whether to print verbose output during execution.
-    
+
     Args:
         input (str): Boolean expression defining the input keys needed from the state.
         output (List[str]): List of output keys to be updated in the state.
         node_config (Optional[dict]): Additional configuration for the node.
         node_name (str): The unique identifier name for the node, defaulting to "Fetch".
     """
 
-    def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"):
+    def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
         super().__init__(node_name, "node", input, output, 1)
 
-        self.headless = True if node_config is None else node_config.get("headless", True)
-        self.verbose = False if node_config is None else node_config.get("verbose", False)
+        self.headless = True if node_config is None else node_config.get(
+            "headless", True)
+        self.verbose = False if node_config is None else node_config.get(
+            "verbose", False)
 
     def execute(self, state):
         """
@@ -72,6 +74,16 @@ def execute(self, state):
             loader = PyPDFLoader(source)
             compressed_document = loader.load()
 
+        elif self.input == "csv":
+            compressed_document = [Document(page_content=pd.read_csv(source), metadata={
+                "source": "xml"
+            })]
+        elif self.input == "xml":
+            with open(source, 'r', encoding='utf-8') as f:
+                data = f.read()
+            compressed_document = [Document(page_content=data, metadata={
+                "source": "xml"
+            })]
         elif self.input == "pdf_dir":
             pass
 
@@ -82,7 +94,7 @@ def execute(self, state):
 
         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:
-                
+
                 loader = AsyncChromiumLoader(
                     [source],
                     proxies={"http": self.node_config["endpoint"]},