Skip to content

Commit 5d1fe68

Browse files
authored
Merge branch 'pre/beta' into temp
2 parents dcef172 + bd2afef commit 5d1fe68

File tree

173 files changed

+3794
-803
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

173 files changed

+3794
-803
lines changed

CHANGELOG.md

Lines changed: 233 additions & 2 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,10 @@ Additional dependencies can be added while installing the library:
3838

3939
- <b>More Language Models</b>: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
4040

41-
```bash
42-
pip install scrapegraphai[other-language-models]
43-
```
41+
42+
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
43+
```bash
44+
pip install scrapegraphai[other-language-models]
4445

4546
- <b>Semantic Options</b>: this group includes tools for advanced semantic processing, such as Graphviz.
4647

@@ -58,6 +59,13 @@ Additional dependencies can be added while installing the library:
5859

5960

6061

62+
### Installing "More Browser Options"
63+
64+
This group includes an OCR scraper for websites.
65+
```bash
66+
pip install scrapegraphai[screenshot_scraper]
67+
```
68+
6169
## 💻 Usage
6270
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
6371

docs/chinese.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ from scrapegraphai.graphs import SpeechGraph
133133
graph_config = {
134134
"llm": {
135135
"api_key": "OPENAI_API_KEY",
136-
"model": "gpt-3.5-turbo",
136+
"model": "openai/gpt-3.5-turbo",
137137
},
138138
"tts_model": {
139139
"api_key": "OPENAI_API_KEY",

docs/japanese.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ from scrapegraphai.graphs import SpeechGraph
133133
graph_config = {
134134
"llm": {
135135
"api_key": "OPENAI_API_KEY",
136-
"model": "gpt-3.5-turbo",
136+
"model": "openai/gpt-3.5-turbo",
137137
},
138138
"tts_model": {
139139
"api_key": "OPENAI_API_KEY",

docs/korean.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ from scrapegraphai.graphs import SpeechGraph
132132
graph_config = {
133133
"llm": {
134134
"api_key": "OPENAI_API_KEY",
135-
"model": "gpt-3.5-turbo",
135+
"model": "openai/gpt-3.5-turbo",
136136
},
137137
"tts_model": {
138138
"api_key": "OPENAI_API_KEY",

docs/russian.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ from scrapegraphai.graphs import SpeechGraph
138138
graph_config = {
139139
"llm": {
140140
"api_key": "OPENAI_API_KEY",
141-
"model": "gpt-3.5-turbo",
141+
"model": "openai/gpt-3.5-turbo",
142142
},
143143
"tts_model": {
144144
"api_key": "OPENAI_API_KEY",

docs/source/getting_started/examples.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ OpenAI models
2222
graph_config = {
2323
"llm": {
2424
"api_key": openai_key,
25-
"model": "gpt-3.5-turbo",
25+
"model": "openai/gpt-3.5-turbo",
2626
},
2727
}
2828

examples/anthropic/custom_graph_haiku.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040

4141
fetch_node = FetchNode(
4242
input="url | local_dir",
43-
output=["doc", "link_urls", "img_urls"],
43+
output=["doc"],
4444
node_config={
4545
"verbose": True,
4646
"headless": True,
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper while setting an API rate limit.
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import SmartScraperGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
11+
# required environment variables in .env
12+
# ANTHROPIC_API_KEY
13+
load_dotenv()
14+
15+
# ************************************************
16+
# Create the SmartScraperGraph instance and run it
17+
# ************************************************
18+
19+
graph_config = {
20+
"llm": {
21+
"api_key": os.getenv("ANTHROPIC_API_KEY"),
22+
"model": "anthropic/claude-3-haiku-20240307",
23+
"rate_limit": {
24+
"requests_per_second": 1
25+
}
26+
},
27+
}
28+
29+
smart_scraper_graph = SmartScraperGraph(
30+
prompt="""Don't say anything else. Output JSON only. List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
31+
event_end_date, event_end_time, location, event_mode, event_category,
32+
third_party_redirect, no_of_days,
33+
time_in_hours, hosted_or_attending, refreshments_type,
34+
registration_available, registration_link""",
35+
# also accepts a string with the already downloaded HTML code
36+
source="https://www.hmhco.com/event",
37+
config=graph_config
38+
)
39+
40+
result = smart_scraper_graph.run()
41+
print(result)
42+
43+
# ************************************************
44+
# Get graph execution info
45+
# ************************************************
46+
47+
graph_exec_info = smart_scraper_graph.get_execution_info()
48+
print(prettify_exec_info(graph_exec_info))
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
2+
Basic example of a scraping pipeline using SmartScraperMultiConcatGraph
3+
"""
4+
5+
import os
6+
import json
7+
from dotenv import load_dotenv
8+
from scrapegraphai.graphs import SmartScraperMultiConcatGraph
9+
10+
load_dotenv()
11+
12+
# ************************************************
13+
# Define the configuration for the graph
14+
# ************************************************
15+
16+
graph_config = {
17+
"llm": {
18+
"api_key": os.getenv("ANTHROPIC_API_KEY"),
19+
"model": "anthropic/claude-3-haiku-20240307",
20+
},
21+
}
22+
23+
24+
# *******************************************************
25+
# Create the SmartScraperMultiConcatGraph instance and run it
26+
# *******************************************************
27+
28+
multiple_search_graph = SmartScraperMultiConcatGraph(
29+
prompt="Who is Marco Perini?",
30+
source= [
31+
"https://perinim.github.io/",
32+
"https://perinim.github.io/cv/"
33+
],
34+
schema=None,
35+
config=graph_config
36+
)
37+
38+
result = multiple_search_graph.run()
39+
print(json.dumps(result, indent=4))

examples/azure/rate_limit_azure.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper with a custom rate limit
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import SmartScraperGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
11+
# required environment variables in .env
12+
# AZURE_OPENAI_ENDPOINT
13+
# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
14+
# MODEL_NAME
15+
# AZURE_OPENAI_KEY
16+
# OPENAI_API_TYPE
17+
# AZURE_OPENAI_API_VERSION
18+
# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME
19+
load_dotenv()
20+
21+
22+
# ************************************************
23+
# Initialize the model instances
24+
# ************************************************
25+
26+
graph_config = {
27+
"llm": {
28+
"api_key": os.environ["AZURE_OPENAI_KEY"],
29+
"model": "azure_openai/gpt-3.5-turbo",
30+
"rate_limit": {
31+
"requests_per_second": 1
32+
},
33+
},
34+
"verbose": True,
35+
"headless": False
36+
}
37+
38+
smart_scraper_graph = SmartScraperGraph(
39+
prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
40+
event_end_date, event_end_time, location, event_mode, event_category,
41+
third_party_redirect, no_of_days,
42+
time_in_hours, hosted_or_attending, refreshments_type,
43+
registration_available, registration_link""",
44+
# also accepts a string with the already downloaded HTML code
45+
source="https://www.hmhco.com/event",
46+
config=graph_config
47+
)
48+
49+
result = smart_scraper_graph.run()
50+
print(result)
51+
52+
# ************************************************
53+
# Get graph execution info
54+
# ************************************************
55+
56+
graph_exec_info = smart_scraper_graph.get_execution_info()
57+
print(prettify_exec_info(graph_exec_info))

examples/azure/smart_scraper_multi_azure.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""
22
Basic example of scraping pipeline using SmartScraper
33
"""
4-
5-
import os, json
4+
import os
5+
import json
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperMultiGraph
88

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
2+
Basic example of a scraping pipeline using SmartScraperMultiConcatGraph
3+
"""
4+
5+
import os
6+
import json
7+
from dotenv import load_dotenv
8+
from scrapegraphai.graphs import SmartScraperMultiConcatGraph
9+
10+
load_dotenv()
11+
12+
# ************************************************
13+
# Define the configuration for the graph
14+
# ************************************************
15+
graph_config = {
16+
"llm": {
17+
"api_key": os.environ["AZURE_OPENAI_KEY"],
18+
"model": "azure_openai/gpt-3.5-turbo",
19+
},
20+
"verbose": True,
21+
"headless": False
22+
}
23+
24+
# *******************************************************
25+
# Create the SmartScraperMultiConcatGraph instance and run it
26+
# *******************************************************
27+
28+
multiple_search_graph = SmartScraperMultiConcatGraph(
29+
prompt="Who is Marco Perini?",
30+
source= [
31+
"https://perinim.github.io/",
32+
"https://perinim.github.io/cv/"
33+
],
34+
schema=None,
35+
config=graph_config
36+
)
37+
38+
result = multiple_search_graph.run()
39+
print(json.dumps(result, indent=4))

examples/bedrock/custom_graph_bedrock.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555

5656
fetch_node = FetchNode(
5757
input="url | local_dir",
58-
output=["doc", "link_urls", "img_urls"],
58+
output=["doc"],
5959
node_config={
6060
"verbose": True,
6161
"headless": True,
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper with a custom rate limit
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import SmartScraperGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
load_dotenv()
11+
12+
13+
# ************************************************
14+
# Define the configuration for the graph
15+
# ************************************************
16+
17+
graph_config = {
18+
"llm": {
19+
"client": "client_name",
20+
"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
21+
"temperature": 0.0,
22+
"rate_limit": {
23+
"requests_per_second": 1
24+
},
25+
}
26+
}
27+
28+
# ************************************************
29+
# Create the SmartScraperGraph instance and run it
30+
# ************************************************
31+
32+
smart_scraper_graph = SmartScraperGraph(
33+
prompt="List me all the projects with their description",
34+
# also accepts a string with the already downloaded HTML code
35+
source="https://perinim.github.io/projects/",
36+
config=graph_config
37+
)
38+
39+
result = smart_scraper_graph.run()
40+
print(result)
41+
42+
# ************************************************
43+
# Get graph execution info
44+
# ************************************************
45+
46+
graph_exec_info = smart_scraper_graph.get_execution_info()
47+
print(prettify_exec_info(graph_exec_info))

examples/bedrock/smart_scraper_multi_bedrock.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
"""
22
Basic example of scraping pipeline using SmartScraper
33
"""
4-
5-
import os, json
6-
from dotenv import load_dotenv
4+
import json
75
from scrapegraphai.graphs import SmartScraperMultiGraph
86

9-
load_dotenv()
107

118
# ************************************************
129
# Define the configuration for the graph
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""
2+
Basic example of a scraping pipeline using SmartScraperMultiConcatGraph
3+
"""
4+
import json
5+
from scrapegraphai.graphs import SmartScraperMultiConcatGraph
6+
7+
# ************************************************
8+
# Define the configuration for the graph
9+
# ************************************************
10+
11+
graph_config = {
12+
"llm": {
13+
"client": "client_name",
14+
"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
15+
"temperature": 0.0
16+
}
17+
}
18+
19+
20+
# *******************************************************
21+
# Create the SmartScraperMultiConcatGraph instance and run it
22+
# *******************************************************
23+
24+
multiple_search_graph = SmartScraperMultiConcatGraph(
25+
prompt="Who is Marco Perini?",
26+
source= [
27+
"https://perinim.github.io/",
28+
"https://perinim.github.io/cv/"
29+
],
30+
schema=None,
31+
config=graph_config
32+
)
33+
34+
result = multiple_search_graph.run()
35+
print(json.dumps(result, indent=4))

examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
graph_config = {
2525
"llm": {
2626
"api_key": openai_key,
27-
"model": "gpt-3.5-turbo",
27+
"model": "openai/gpt-3.5-turbo",
2828
},
2929
"library": "beautifoulsoup"
3030
}

0 commit comments

Comments
 (0)