Skip to content

Commit b408655

Browse files
committed
feat: add csv scraper and xml scraper multi
1 parent fa9722d commit b408655

File tree

5 files changed

+361
-0
lines changed

5 files changed

+361
-0
lines changed
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
"""
Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents
"""

import os
import pandas as pd
from scrapegraphai.graphs import CSVScraperMultiGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# ************************************************
# Read the CSV file
# ************************************************

FILE_NAME = "inputs/username.csv"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

# Serialize the full DataFrame back to CSV text. Using str(DataFrame) would
# hand the model a truncated tabular repr (pandas elides rows/columns beyond
# its display limits), silently dropping data for larger files; to_csv()
# with no path returns the complete CSV content as a string.
text = pd.read_csv(file_path).to_csv(index=False)

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "model_tokens": 2000, # set context length arbitrarily
        "base_url": "http://localhost:11434",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    },
    "verbose": True,
}

# ************************************************
# Create the CSVScraperMultiGraph instance and run it
# ************************************************

# The same document is passed twice purely to demonstrate the multi-source API.
csv_scraper_graph = CSVScraperMultiGraph(
    prompt="List me all the last names",
    source=[text, text],  # already a string; no str() wrapper needed
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""
Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import XMLScraperMultiGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()

# ************************************************
# Read the XML file
# ************************************************

FILE_NAME = "inputs/books.xml"

# Resolve the sample file relative to this script's own directory so the
# example works regardless of the current working directory.
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, 'r', encoding="utf-8") as xml_file:
    text = xml_file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

OLLAMA_URL = "http://localhost:11434"

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "model_tokens": 2000, # set context length arbitrarily
        "base_url": OLLAMA_URL,
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": OLLAMA_URL,
    },
    "verbose": True,
}

# ************************************************
# Create the XMLScraperMultiGraph instance and run it
# ************************************************

xml_scraper_graph = XMLScraperMultiGraph(
    prompt="List me all the authors, title and genres of the books",
    source=[text, text],  # Pass the content of the file, not the file object
    config=graph_config
)

result = xml_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = xml_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

scrapegraphai/graphs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,5 @@
1818
from .smart_scraper_multi_graph import SmartScraperMultiGraph
1919
from .pdf_scraper_multi import PdfScraperMultiGraph
2020
from .json_scraper_multi import JSONScraperMultiGraph
21+
from .csv_scraper_graph_multi import CSVScraperMultiGraph
22+
from .xml_scraper_graph_multi import XMLScraperMultiGraph
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""
2+
CSVScraperMultiGraph Module
3+
"""
4+
5+
from copy import copy, deepcopy
6+
from typing import List, Optional
7+
8+
from .base_graph import BaseGraph
9+
from .abstract_graph import AbstractGraph
10+
from .csv_scraper_graph import CSVScraperGraph
11+
12+
from ..nodes import (
13+
GraphIteratorNode,
14+
MergeAnswersNode
15+
)
16+
17+
18+
class CSVScraperMultiGraph(AbstractGraph):
    """
    CSVScraperMultiGraph is a scraping pipeline that runs a CSVScraperGraph over
    each entry of a list of CSV sources and merges the per-source answers into
    a single reply to the user prompt.
    It only requires a user prompt and a list of sources.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[str]): The schema for the graph output.

    Example:
        >>> search_graph = CSVScraperMultiGraph(
        ...     "What is Chioggia famous for?",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
        self.max_results = config.get("max_results", 3)

        # A shallow copy is enough when every top-level value is a plain
        # string; otherwise deep-copy so nested dicts are not shared with
        # the caller's config.
        if all(isinstance(cfg_value, str) for cfg_value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching.

        Returns:
            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """
        # Template graph that the iterator node re-runs once per input source;
        # prompt/source are filled in at iteration time.
        scraper_template = CSVScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
        )

        iterator_node = GraphIteratorNode(
            input="user_prompt & jsons",
            output=["results"],
            node_config={"graph_instance": scraper_template},
        )

        merge_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={"llm_model": self.llm_model, "schema": self.schema},
        )

        return BaseGraph(
            nodes=[iterator_node, merge_node],
            edges=[(iterator_node, merge_node)],
            entry_point=iterator_node,
        )

    def run(self) -> str:
        """
        Executes the scraping process over every source and returns the merged answer.

        Returns:
            str: The answer to the prompt.
        """
        initial_state = {"user_prompt": self.prompt, "jsons": self.source}
        self.final_state, self.execution_info = self.graph.execute(initial_state)
        return self.final_state.get("answer", "No answer found.")
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
"""
2+
XMLScraperMultiGraph Module
3+
"""
4+
5+
from copy import copy, deepcopy
6+
from typing import List, Optional
7+
8+
from .base_graph import BaseGraph
9+
from .abstract_graph import AbstractGraph
10+
from .xml_scraper_graph import XMLScraperGraph
11+
12+
from ..nodes import (
13+
GraphIteratorNode,
14+
MergeAnswersNode
15+
)
16+
17+
18+
class XMLScraperMultiGraph(AbstractGraph):
    """
    XMLScraperMultiGraph is a scraping pipeline that runs an XMLScraperGraph
    over each entry of a list of XML sources and merges the per-source answers
    into a single reply to the user prompt.
    It only requires a user prompt and a list of sources.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[str]): The schema for the graph output.

    Example:
        >>> search_graph = XMLScraperMultiGraph(
        ...     "What is Chioggia famous for?",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
        self.max_results = config.get("max_results", 3)

        # A shallow copy is enough when every top-level value is a plain
        # string; otherwise deep-copy so nested dicts are not shared with
        # the caller's config.
        if all(isinstance(cfg_value, str) for cfg_value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching.

        Returns:
            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """
        # Template graph that the iterator node re-runs once per input source;
        # prompt/source are filled in at iteration time.
        scraper_template = XMLScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
        )

        iterator_node = GraphIteratorNode(
            input="user_prompt & jsons",
            output=["results"],
            node_config={"graph_instance": scraper_template},
        )

        merge_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={"llm_model": self.llm_model, "schema": self.schema},
        )

        return BaseGraph(
            nodes=[iterator_node, merge_node],
            edges=[(iterator_node, merge_node)],
            entry_point=iterator_node,
        )

    def run(self) -> str:
        """
        Executes the scraping process over every source and returns the merged answer.

        Returns:
            str: The answer to the prompt.
        """
        initial_state = {"user_prompt": self.prompt, "jsons": self.source}
        self.final_state, self.execution_info = self.graph.execute(initial_state)
        return self.final_state.get("answer", "No answer found.")

0 commit comments

Comments
 (0)