Skip to content

Commit 2804434

Browse files
committed
feat: add integrations for markdown files
1 parent f3cbbce commit 2804434

File tree

9 files changed

+335
-14
lines changed

9 files changed

+335
-14
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
Marco Perini Toggle navigation
2+
3+
* About
4+
* Projects(current)
5+
6+
Projects
7+
8+
Competitions
9+
10+
* CV
11+
* ____
12+
13+
# Projects
14+
15+
![project thumbnail Rotary Pendulum RL
16+
Open Source project aimed at controlling a real life rotary pendulum using RL
17+
algorithms ](/projects/rotary-pendulum-rl/)
18+
19+
![project thumbnail DQN
20+
Implementation from scratch Developed a Deep Q-Network algorithm to train a
21+
simple and double pendulum ](https://github.com/PeriniM/DQN-SwingUp)
22+
23+
![project thumbnail Multi Agents HAED
24+
University project which focuses on simulating a multi-agent system to perform
25+
environment mapping. Agents, equipped with sensors, explore and record their
26+
surroundings, considering uncertainties in their readings.
27+
](https://github.com/PeriniM/Multi-Agents-HAED)
28+
29+
![project thumbnail Wireless ESC for Modular
30+
Drones Modular drone architecture proposal and proof of concept. The project
31+
received maximum grade. ](/projects/wireless-esc-drone/)
32+
33+
© Copyright 2023 Marco Perini. Powered by Jekyll with
34+
al-folio theme. Hosted by [GitHub
35+
Pages](https://pages.github.com/).

examples/openai/md_scraper_openai.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Basic example of scraping pipeline using MDScraperGraph from XML documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import MDScraperGraph
8+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the XML file
13+
# ************************************************
14+
15+
FILE_NAME = "inputs/markdown_example.md"
16+
curr_dir = os.path.dirname(os.path.realpath(__file__))
17+
file_path = os.path.join(curr_dir, FILE_NAME)
18+
19+
with open(file_path, 'r', encoding="utf-8") as file:
20+
text = file.read()
21+
22+
# ************************************************
23+
# Define the configuration for the graph
24+
# ************************************************
25+
26+
openai_key = os.getenv("OPENAI_APIKEY")
27+
28+
graph_config = {
29+
"llm": {
30+
"api_key": openai_key,
31+
"model": "gpt-3.5-turbo",
32+
},
33+
}
34+
35+
# ************************************************
36+
# Create the MDScraperGraph instance and run it
37+
# ************************************************
38+
39+
md_scraper_graph = MDScraperGraph(
40+
prompt="List me all the authors, title and genres of the books",
41+
source=text, # Pass the content of the file, not the file object
42+
config=graph_config
43+
)
44+
45+
result = md_scraper_graph.run()
46+
print(result)
47+
48+
# ************************************************
49+
# Get graph execution info
50+
# ************************************************
51+
52+
graph_exec_info = md_scraper_graph.get_execution_info()
53+
print(prettify_exec_info(graph_exec_info))
54+
55+
# Save to json or csv
56+
convert_to_csv(result, "result")
57+
convert_to_json(result, "result")

scrapegraphai/graphs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,5 @@
2121
from .csv_scraper_multi_graph import CSVScraperMultiGraph
2222
from .xml_scraper_multi_graph import XMLScraperMultiGraph
2323
from .script_creator_multi_graph import ScriptCreatorMultiGraph
24+
from .markdown_scraper_graph import MDScraperGraph
25+
from .markdown_scraper_multi_graph import MDScraperMultiGraph
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
from typing import Optional
2+
import logging
3+
from pydantic import BaseModel
4+
from .base_graph import BaseGraph
5+
from .abstract_graph import AbstractGraph
6+
from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
7+
8+
class MDScraperGraph(AbstractGraph):
    """
    MDScraperGraph is a scraping pipeline that automates the process of
    extracting information from markdown documents using a natural language
    model to interpret and answer prompts.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph (markdown content, a ``.md``
            file path, or a directory of markdown files).
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.

    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.

    Example:
        >>> md_scraper = MDScraperGraph(
        ...     "List me all the projects",
        ...     "projects.md",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = md_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
        super().__init__(prompt, config, source, schema)

        # Route single markdown files to the "md" input key, everything else
        # (e.g. a directory of markdown files) to "md_dir". Checking for the
        # ".md" extension (rather than the bare suffix "md") avoids
        # misclassifying sources that merely end in those two letters.
        self.input_key = "md" if source.endswith(".md") else "md_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for markdown scraping.

        Returns:
            BaseGraph: A graph instance representing the markdown scraping workflow
            (fetch -> parse -> RAG -> generate answer).
        """
        fetch_node = FetchNode(
            input="md | md_dir",
            output=["doc"],
            node_config={
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                # The source is markdown, not HTML, so skip HTML parsing.
                "parse_html": False,
                "chunk_size": self.model_token
            }
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm_model": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        generate_answer_node = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema,
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The answer to the prompt, or "No answer found." when the
            graph produced no answer.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
"""
2+
MDScraperMultiGraph Module
3+
"""
4+
5+
from copy import copy, deepcopy
6+
from typing import List, Optional
7+
from pydantic import BaseModel
8+
9+
from .base_graph import BaseGraph
10+
from .abstract_graph import AbstractGraph
11+
from .markdown_scraper_graph import MDScraperGraph
12+
13+
from ..nodes import (
14+
GraphIteratorNode,
15+
MergeAnswersNode
16+
)
17+
18+
19+
class MDScraperMultiGraph(AbstractGraph):
    """
    MDScraperMultiGraph is a scraping pipeline that scrapes a list of markdown
    sources and generates answers to a given prompt. It only requires a user
    prompt and a list of sources.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The list of sources to scrape.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> search_graph = MDScraperMultiGraph(
        ...     "What is Chioggia famous for?",
        ...     ["doc1.md", "doc2.md"],
        ...     {"llm_model": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
        # A shallow copy suffices when every config value is a plain string;
        # otherwise deep-copy so nested structures are not shared between the
        # child graphs and the caller.
        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        self.copy_schema = deepcopy(schema)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for markdown
        scraping and answer merging.

        Returns:
            BaseGraph: A graph instance representing the scraping and merging workflow.
        """
        # Create an MDScraperGraph instance to be cloned by the iterator node
        # for each source; prompt and source are filled in per iteration.
        md_scraper_instance = MDScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
            schema=self.copy_schema
        )

        # Define the graph nodes
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & jsons",
            output=["results"],
            node_config={
                "graph_instance": md_scraper_instance,
            }
        )

        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the markdown scraping and merging process.

        Returns:
            str: The answer to the prompt, or "No answer found." when the
            graph produced no answer.
        """
        # The key must match the iterator node's declared input
        # ("user_prompt & jsons"); the original "xmls" key (copied from the
        # XML multi graph) meant the sources never reached the iterator.
        inputs = {"user_prompt": self.prompt, "jsons": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")

scrapegraphai/graphs/pdf_scraper_multi_graph.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,6 @@ class PdfScraperMultiGraph(AbstractGraph):
4646

4747
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
4848

49-
self.max_results = config.get("max_results", 3)
50-
5149
if all(isinstance(value, str) for value in config.values()):
5250
self.copy_config = copy(config)
5351
else:

scrapegraphai/graphs/xml_scraper_multi_graph.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,6 @@ class XMLScraperMultiGraph(AbstractGraph):
4646

4747
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
4848

49-
self.max_results = config.get("max_results", 3)
50-
5149
if all(isinstance(value, str) for value in config.values()):
5250
self.copy_config = copy(config)
5351
else:
@@ -116,7 +114,7 @@ def run(self) -> str:
116114
Returns:
117115
str: The answer to the prompt.
118116
"""
119-
inputs = {"user_prompt": self.prompt, "jsons": self.source}
117+
inputs = {"user_prompt": self.prompt, "xmls": self.source}
120118
self.final_state, self.execution_info = self.graph.execute(inputs)
121119

122120
return self.final_state.get("answer", "No answer found.")

0 commit comments

Comments
 (0)