Skip to content

Commit 324e977

Browse files
committed
fix: use `csv`/`xml` input keys in the CSV and XML scraper graphs and load CSV/XML sources in FetchNode
1 parent 84e8d12 commit 324e977

File tree

3 files changed

+22
-10
lines changed

3 files changed

+22
-10
lines changed

scrapegraphai/graphs/csv_scraper_graph.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def _create_graph(self):
3030
Creates the graph of nodes representing the workflow for web scraping.
3131
"""
3232
fetch_node = FetchNode(
33-
input="csv_dir",
33+
input="csv",
3434
output=["doc"],
3535
)
3636
parse_node = ParseNode(
@@ -78,4 +78,4 @@ def run(self) -> str:
7878
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
7979
self.final_state, self.execution_info = self.graph.execute(inputs)
8080

81-
return self.final_state.get("answer", "No answer found.")
81+
return self.final_state.get("answer", "No answer found.")

scrapegraphai/graphs/xml_scraper_graph.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph:
5656
"""
5757

5858
fetch_node = FetchNode(
59-
input="xml_dir",
59+
input="xml",
6060
output=["doc"]
6161
)
6262
parse_node = ParseNode(
@@ -108,4 +108,4 @@ def run(self) -> str:
108108
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
109109
self.final_state, self.execution_info = self.graph.execute(inputs)
110110

111-
return self.final_state.get("answer", "No answer found.")
111+
return self.final_state.get("answer", "No answer found.")

scrapegraphai/nodes/fetch_node.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
FetchNode Module
33
"""
4-
4+
import pandas as pd
55
from typing import List, Optional
66
from langchain_community.document_loaders import AsyncChromiumLoader
77
from langchain_core.documents import Document
@@ -22,19 +22,21 @@ class FetchNode(BaseNode):
2222
Attributes:
2323
headless (bool): A flag indicating whether the browser should run in headless mode.
2424
verbose (bool): A flag indicating whether to print verbose output during execution.
25-
25+
2626
Args:
2727
input (str): Boolean expression defining the input keys needed from the state.
2828
output (List[str]): List of output keys to be updated in the state.
2929
node_config (Optional[dict]): Additional configuration for the node.
3030
node_name (str): The unique identifier name for the node, defaulting to "Fetch".
3131
"""
3232

33-
def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"):
33+
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
3434
super().__init__(node_name, "node", input, output, 1)
3535

36-
self.headless = True if node_config is None else node_config.get("headless", True)
37-
self.verbose = False if node_config is None else node_config.get("verbose", False)
36+
self.headless = True if node_config is None else node_config.get(
37+
"headless", True)
38+
self.verbose = False if node_config is None else node_config.get(
39+
"verbose", False)
3840

3941
def execute(self, state):
4042
"""
@@ -72,6 +74,16 @@ def execute(self, state):
7274
loader = PyPDFLoader(source)
7375
compressed_document = loader.load()
7476

77+
elif self.input == "csv":
78+
compressed_document = [Document(page_content=pd.read_csv(source), metadata={
79+
"source": "xml"
80+
})]
81+
elif self.input == "xml":
82+
with open(source, 'r', encoding='utf-8') as f:
83+
data = f.read()
84+
compressed_document = [Document(page_content=data, metadata={
85+
"source": "xml"
86+
})]
7587
elif self.input == "pdf_dir":
7688
pass
7789

@@ -82,7 +94,7 @@ def execute(self, state):
8294

8395
else:
8496
if self.node_config is not None and self.node_config.get("endpoint") is not None:
85-
97+
8698
loader = AsyncChromiumLoader(
8799
[source],
88100
proxies={"http": self.node_config["endpoint"]},

0 commit comments

Comments
 (0)