Skip to content

Commit 3bf5f57

Browse files
committed
feat: add integration for infos
1 parent e3a19c2 commit 3bf5f57

16 files changed

+96
-6
lines changed

examples/extras/custom_prompt.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
"""
Example: run a SmartScraperGraph whose configuration carries extra prompt text.

The free-form string stored under the ``additional_info`` configuration key
is handed to the graph together with the main prompt.
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

# ------------------------------------------------
# Graph configuration
# ------------------------------------------------

openai_key = os.getenv("OPENAI_APIKEY")

# Extra text passed to the graph through the "additional_info" config entry.
prompt = "Some more info"

llm_settings = {"api_key": openai_key, "model": "gpt-3.5-turbo"}

graph_config = dict(
    llm=llm_settings,
    additional_info=prompt,
    verbose=True,
    headless=False,
)

# ------------------------------------------------
# Build the SmartScraperGraph and execute it
# ------------------------------------------------

# `source` also accepts a string with the already downloaded HTML code.
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ------------------------------------------------
# Report graph execution info
# ------------------------------------------------

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

scrapegraphai/graphs/csv_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def _create_graph(self):
5050
output=["answer"],
5151
node_config={
5252
"llm_model": self.llm_model,
53+
"additional_info": self.config.get("additional_info"),
5354
"schema": self.schema,
5455
}
5556
)

scrapegraphai/graphs/deep_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ def _create_repeated_graph(self) -> BaseGraph:
9595
output=["answer"],
9696
node_config={
9797
"llm_model": self.llm_model,
98+
"additional_info": self.config.get("additional_info"),
9899
"schema": self.schema
99100
}
100101
)

scrapegraphai/graphs/json_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def _create_graph(self) -> BaseGraph:
7575
output=["answer"],
7676
node_config={
7777
"llm_model": self.llm_model,
78+
"additional_info": self.config.get("additional_info"),
7879
"schema": self.schema
7980
}
8081
)

scrapegraphai/graphs/markdown_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ def _create_graph(self) -> BaseGraph:
7676
output=["answer"],
7777
node_config={
7878
"llm_model": self.llm_model,
79+
"additional_info": self.config.get("additional_info"),
7980
"schema": self.schema,
8081
}
8182
)

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
from ..models import OpenAIImageToText
2020

21-
2221
class OmniScraperGraph(AbstractGraph):
2322
"""
2423
OmniScraper is a scraping pipeline that automates the process of
@@ -60,7 +59,6 @@ def __init__(self, prompt: str, source: str, config: dict, schema: Optional[Base
6059
super().__init__(prompt, config, source, schema)
6160

6261
self.input_key = "url" if source.startswith("http") else "local_dir"
63-
6462

6563
def _create_graph(self) -> BaseGraph:
6664
"""
@@ -104,6 +102,7 @@ def _create_graph(self) -> BaseGraph:
104102
output=["answer"],
105103
node_config={
106104
"llm_model": self.llm_model,
105+
"additional_info": self.config.get("additional_info"),
107106
"schema": self.schema
108107
}
109108
)

scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ def _create_graph(self) -> BaseGraph:
8989
output=["answer"],
9090
node_config={
9191
"llm_model": self.llm_model,
92+
"additional_info": self.config.get("additional_info"),
9293
"schema": self.schema
9394
}
9495
)

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def _create_graph(self) -> BaseGraph:
8484
output=["answer"],
8585
node_config={
8686
"llm_model": self.llm_model,
87+
"additional_info": self.config.get("additional_info"),
8788
"schema": self.schema,
8889
},
8990
library=self.library,

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def _create_graph(self) -> BaseGraph:
9191
output=["answer"],
9292
node_config={
9393
"llm_model": self.llm_model,
94+
"additional_info": self.config.get("additional_info"),
9495
"schema": self.schema,
9596
}
9697
)

scrapegraphai/graphs/speech_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def _create_graph(self) -> BaseGraph:
8484
output=["answer"],
8585
node_config={
8686
"llm_model": self.llm_model,
87+
"additional_info": self.config.get("additional_info"),
8788
"schema": self.schema
8889
}
8990
)

scrapegraphai/graphs/xml_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def _create_graph(self) -> BaseGraph:
7777
output=["answer"],
7878
node_config={
7979
"llm_model": self.llm_model,
80+
"additional_info": self.config.get("additional_info"),
8081
"schema": self.schema
8182
}
8283
)

scrapegraphai/nodes/generate_answer_csv_node.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,14 @@ def __init__(
5858
node_name (str): name of the node
5959
"""
6060
super().__init__(node_name, "node", input, output, 2, node_config)
61-
61+
6262
self.llm_model = node_config["llm_model"]
63+
6364
self.verbose = (
6465
False if node_config is None else node_config.get("verbose", False)
6566
)
67+
68+
self.additional_info = node_config.get("additional_info")
6669

6770
def execute(self, state):
6871
"""
@@ -99,9 +102,14 @@ def execute(self, state):
99102
output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
100103
else:
101104
output_parser = JsonOutputParser()
105+
106+
if self.additional_info is not None:
107+
template_no_chunks_csv += self.additional_info
108+
template_chunks_csv += self.additional_info
109+
template_merge_csv += self.additional_info
102110

103111
format_instructions = output_parser.get_format_instructions()
104-
112+
105113
chains_dict = {}
106114

107115
# Use tqdm to add progress bar

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def __init__(
5454
False if node_config is None else node_config.get("script_creator", False)
5555
)
5656

57+
self.additional_info = node_config.get("additional_info")
5758

5859
def execute(self, state: dict) -> dict:
5960
"""
@@ -98,6 +99,11 @@ def execute(self, state: dict) -> dict:
9899
template_chunks_prompt = template_chunks
99100
template_merge_prompt = template_merge
100101

102+
if self.additional_info is not None:
103+
template_no_chunks_prompt += self.additional_info
104+
template_chunks_prompt += self.additional_info
105+
template_merge_prompt += self.additional_info
106+
101107
chains_dict = {}
102108

103109
# Use tqdm to add progress bar
@@ -118,7 +124,6 @@ def execute(self, state: dict) -> dict:
118124
partial_variables={"context": chunk.page_content,
119125
"chunk_id": i + 1,
120126
"format_instructions": format_instructions})
121-
122127
# Dynamically name the chains based on their index
123128
chain_name = f"chunk{i+1}"
124129
chains_dict[chain_name] = prompt | self.llm_model | output_parser

scrapegraphai/nodes/generate_answer_omni_node.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,13 @@ def __init__(
4646
self.llm_model = node_config["llm_model"]
4747
if isinstance(node_config["llm_model"], Ollama):
4848
self.llm_model.format="json"
49-
49+
5050
self.verbose = (
5151
False if node_config is None else node_config.get("verbose", False)
5252
)
5353

54+
self.additional_info = node_config.get("additional_info")
55+
5456
def execute(self, state: dict) -> dict:
5557
"""
5658
Generates an answer by constructing a prompt from the user's input and the scraped
@@ -86,6 +88,11 @@ def execute(self, state: dict) -> dict:
8688
else:
8789
output_parser = JsonOutputParser()
8890

91+
if self.additional_info is not None:
92+
template_no_chunk_omni += self.additional_info
93+
template_chunks_omni += self.additional_info
94+
template_merge_omni += self.additional_info
95+
8996
format_instructions = output_parser.get_format_instructions()
9097

9198

scrapegraphai/nodes/generate_answer_pdf_node.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,13 @@ def __init__(
6161
self.llm_model = node_config["llm_model"]
6262
if isinstance(node_config["llm_model"], Ollama):
6363
self.llm_model.format="json"
64+
6465
self.verbose = (
6566
False if node_config is None else node_config.get("verbose", False)
6667
)
6768

69+
self.additional_info = node_config.get("additional_info")
70+
6871
def execute(self, state):
6972
"""
7073
Generates an answer by constructing a prompt from the user's input and the scraped
@@ -100,6 +103,11 @@ def execute(self, state):
100103
output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
101104
else:
102105
output_parser = JsonOutputParser()
106+
107+
if self.additional_info is not None:
108+
template_no_chunks_pdf += self.additional_info
109+
template_chunks_pdf += self.additional_info
110+
template_merge_pdf += self.additional_info
103111

104112
format_instructions = output_parser.get_format_instructions()
105113

scrapegraphai/nodes/generate_scraper_node.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ def __init__(
5454
False if node_config is None else node_config.get("verbose", False)
5555
)
5656

57+
self.additional_info = node_config.get("additional_info")
58+
5759
def execute(self, state: dict) -> dict:
5860
"""
5961
Generates a python script for scraping a website using the specified library.
@@ -106,6 +108,8 @@ def execute(self, state: dict) -> dict:
106108
USER QUESTION: {question}
107109
SCHEMA INSTRUCTIONS: {schema_instructions}
108110
"""
111+
if self.additional_info is not None:
112+
template_no_chunks += self.additional_info
109113

110114
if len(doc) > 1:
111115
raise NotImplementedError(

0 commit comments

Comments
 (0)