Commit a296927

feat(omni-scraper): working OmniScraperGraph with images
1 parent 90955ca commit a296927

12 files changed: +516 additions, -87 deletions
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
"""
Example of custom graph using existing nodes
"""

import os
from dotenv import load_dotenv

from langchain_openai import OpenAIEmbeddings
from scrapegraphai.models import OpenAI, OpenAIImageToText
from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import FetchNode, ParseNode, ImageToTextNode, RAGNode, GenerateAnswerOmniNode
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-4o",
        "temperature": 0,
        "streaming": False
    },
}

# ************************************************
# Define the graph nodes
# ************************************************

llm_model = OpenAI(graph_config["llm"])
iit_model = OpenAIImageToText(graph_config["llm"])
embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key)

# define the nodes for the graph

fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc", "link_urls", "img_urls"],
    node_config={
        "verbose": True,
        "headless": True,
    }
)
parse_node = ParseNode(
    input="doc",
    output=["parsed_doc"],
    node_config={
        "chunk_size": 4096,
        "verbose": True,
    }
)
image_to_text_node = ImageToTextNode(
    input="img_urls",
    output=["img_desc"],
    node_config={
        "llm_model": iit_model,
        "max_images": 4,
    }
)
rag_node = RAGNode(
    input="user_prompt & (parsed_doc | doc)",
    output=["relevant_chunks"],
    node_config={
        "llm_model": llm_model,
        "embedder_model": embedder,
        "verbose": True,
    }
)
generate_answer_omni_node = GenerateAnswerOmniNode(
    input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
    output=["answer"],
    node_config={
        "llm_model": llm_model,
        "verbose": True,
    }
)

# ************************************************
# Create the graph by defining the connections
# ************************************************

graph = BaseGraph(
    nodes=[
        fetch_node,
        parse_node,
        image_to_text_node,
        rag_node,
        generate_answer_omni_node,
    ],
    edges=[
        (fetch_node, parse_node),
        (parse_node, image_to_text_node),
        (image_to_text_node, rag_node),
        (rag_node, generate_answer_omni_node)
    ],
    entry_point=fetch_node
)

# ************************************************
# Execute the graph
# ************************************************

result, execution_info = graph.execute({
    "user_prompt": "List me all the projects with their titles and image links and descriptions.",
    "url": "https://perinim.github.io/projects/"
})

# get the answer from the result
result = result.get("answer", "No answer found.")
print(result)
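
The example script reads the key through python-dotenv rather than hard-coding it. A minimal pre-flight sketch (illustrative only, not part of this commit; the variable name OPENAI_APIKEY is taken from the script above):

import os
from dotenv import load_dotenv

load_dotenv()  # loads variables from a .env file in the working directory, if one exists

# the script above expects OPENAI_APIKEY (not the more common OPENAI_API_KEY); fail fast if it is missing
if not os.getenv("OPENAI_APIKEY"):
    raise EnvironmentError("Set OPENAI_APIKEY in the environment or in a .env file")
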
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
"""
Basic example of scraping pipeline using OmniScraper
"""

import os, json
from dotenv import load_dotenv
from scrapegraphai.graphs import OmniScraperGraph
from scrapegraphai.utils import prettify_exec_info, convert_to_csv

load_dotenv()


# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-4o",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the OmniScraperGraph instance and run it
# ************************************************

omni_scraper_graph = OmniScraperGraph(
    prompt="List me all the projects with their titles and image links and descriptions.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=graph_config
)

result = omni_scraper_graph.run()
print(json.dumps(result, indent=2))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = omni_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
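
The script already serializes the result with json.dumps, so persisting it needs nothing beyond the standard library. A minimal sketch (not part of the commit; projects.json is an arbitrary filename, and result stands in for the value returned by omni_scraper_graph.run() above):

import json

result = {"projects": []}  # placeholder: in the script above this is omni_scraper_graph.run()

# write the structured answer next to the script for later inspection
with open("projects.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)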

examples/single_node/image2text_node.py

Lines changed: 4 additions & 1 deletion
@@ -43,7 +43,10 @@
 # ************************************************

 state = {
-    "img_url": "https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/assets/scrapegraphai_logo.png?raw=true"
+    "img_url": [
+        "https://perinim.github.io/assets/img/rotary_pybullet.jpg",
+        "https://perinim.github.io/assets/img/value-policy-heatmaps.jpg",
+    ],
 }

 result = image_to_text_node.execute(state)
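
Continuing the script above, a small consumption sketch (assumptions: the single-node example's output key is "img_desc", as in the ImageToTextNode configuration used in the new examples, and execute() returns the updated state dictionary):

# pair each input URL with the generated description, up to the node's max_images cap
descriptions = result.get("img_desc", [])
for url, desc in zip(state["img_url"], descriptions):
    print(f"{url}\n  -> {desc}")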

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@
 from .json_scraper_graph import JSONScraperGraph
 from .csv_scraper_graph import CSVScraperGraph
 from .pdf_scraper_graph import PDFScraperGraph
+from .omni_scraper_graph import OmniScraperGraph

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
"""
OmniScraperGraph Module
"""

from .base_graph import BaseGraph
from ..nodes import (
    FetchNode,
    ParseNode,
    ImageToTextNode,
    RAGNode,
    GenerateAnswerOmniNode
)
from scrapegraphai.models import OpenAIImageToText
from .abstract_graph import AbstractGraph


class OmniScraperGraph(AbstractGraph):
    """
    OmniScraper is a scraping pipeline that automates the process of
    extracting information from web pages
    using a natural language model to interpret and answer prompts.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client,
        configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.

    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.

    Example:
        >>> omni_scraper = OmniScraperGraph(
        ...     "List me all the attractions in Chioggia and describe their pictures.",
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-4o"}}
        ... )
        >>> result = omni_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict):

        self.max_images = 5 if config is None else config.get("max_images", 5)

        super().__init__(prompt, config, source)

        self.input_key = "url" if source.startswith("http") else "local_dir"


    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc", "link_urls", "img_urls"],
            node_config={
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token
            }
        )
        image_to_text_node = ImageToTextNode(
            input="img_urls",
            output=["img_desc"],
            node_config={
                "llm_model": OpenAIImageToText(self.config["llm"]),
                "max_images": self.max_images
            }
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm_model": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        generate_answer_omni_node = GenerateAnswerOmniNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                image_to_text_node,
                rag_node,
                generate_answer_omni_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, image_to_text_node),
                (image_to_text_node, rag_node),
                (rag_node, generate_answer_omni_node)
            ],
            entry_point=fetch_node
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The answer to the prompt.
        """

        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
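
Reading __init__, the only option specific to OmniScraperGraph is max_images (default 5), read from the top-level config before the base class is initialized. A minimal usage sketch based on the docstring example (the API key value, prompt, and URL are placeholders):

from scrapegraphai.graphs import OmniScraperGraph

omni_scraper = OmniScraperGraph(
    prompt="List me all the attractions in Chioggia and describe their pictures.",
    source="https://en.wikipedia.org/wiki/Chioggia",
    config={
        "llm": {"api_key": "sk-...", "model": "gpt-4o"},  # placeholder key
        "max_images": 10,  # per-run cap on images passed to ImageToTextNode (defaults to 5)
    },
)

print(omni_scraper.run())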

scrapegraphai/nodes/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -18,4 +18,5 @@
 from .generate_answer_csv_node import GenerateAnswerCSVNode
 from .generate_answer_pdf_node import GenerateAnswerPDFNode
 from .graph_iterator_node import GraphIteratorNode
-from .merge_answers_node import MergeAnswersNode
+from .merge_answers_node import MergeAnswersNode
+from .generate_answer_omni_node import GenerateAnswerOmniNode

scrapegraphai/nodes/fetch_node.py

Lines changed: 12 additions & 6 deletions
@@ -118,15 +118,18 @@ def execute(self, state):
             pass

         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=cleanup_html(data, source),
+            title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
+            parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
+            compressed_document = [Document(page_content=parsed_content,
                                             metadata={"source": "local_dir"}
                                             )]

         elif self.useSoup:
             response = requests.get(source)
             if response.status_code == 200:
-                cleanedup_html = cleanup_html(response.text, source)
-                compressed_document = [Document(page_content=cleanedup_html)]
+                title, minimized_body, link_urls, image_urls = cleanup_html(response.text, source)
+                parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
+                compressed_document = [Document(page_content=parsed_content)]
             else:
                 print(f"Failed to retrieve contents from the webpage at url: {source}")

@@ -137,11 +140,14 @@ def execute(self, state):
                 loader_kwargs = self.node_config.get("loader_kwargs", {})

             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
-
             document = loader.load()
+
+            title, minimized_body, link_urls, image_urls = cleanup_html(str(document[0].page_content), source)
+            parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
+
             compressed_document = [
-                Document(page_content=cleanup_html(str(document[0].page_content), source), metadata={"source": source})
+                Document(page_content=parsed_content, metadata={"source": source})
             ]

-        state.update({self.output[0]: compressed_document})
+        state.update({self.output[0]: compressed_document, self.output[1]: link_urls, self.output[2]: image_urls})
         return state
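
All three branches now share one contract: cleanup_html returns a (title, minimized_body, link_urls, image_urls) tuple, and the node fans those values out to its three declared outputs. A self-contained sketch of that data flow (a stub stands in for the real cleanup_html from scrapegraphai.utils; the key names mirror output=["doc", "link_urls", "img_urls"] from the examples above):

from langchain_core.documents import Document

def cleanup_html_stub(html: str, source: str):
    # stand-in for scrapegraphai's cleanup_html: title, minimized body, links, image URLs
    return "Projects", "<body>…</body>", ["https://example.com/a"], ["https://example.com/a.jpg"]

state = {"url": "https://example.com"}

title, minimized_body, link_urls, image_urls = cleanup_html_stub("<html>…</html>", state["url"])
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"

# the node now writes all three outputs instead of only the document
state.update({
    "doc": [Document(page_content=parsed_content, metadata={"source": state["url"]})],
    "link_urls": link_urls,
    "img_urls": image_urls,
})
print(state["img_urls"])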
