Skip to content

Commit 8b51d50

Browse files
authored
Merge pull request #682 from ScrapeGraphAI/temp
Pre/beta
2 parents 31f3f36 + 0cdd47e commit 8b51d50

File tree

208 files changed

+4295
-864
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

208 files changed

+4295
-864
lines changed

CHANGELOG.md

Lines changed: 336 additions & 3 deletions
Large diffs are not rendered by default.

CONTRIBUTING.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,10 @@ If you encounter any issues or have suggestions for improvements, please open an
7474
ScrapeGraphAI is licensed under the **MIT License**. See the [LICENSE](LICENSE) file for more information.
7575
By contributing to this project, you agree to license your contributions under the same license.
7676

77+
ScrapeGraphAI uses code from the Langchain
78+
frameworks. You can find their original licenses below.
79+
80+
LANGCHAIN LICENSE
81+
https://github.com/langchain-ai/langchain/blob/master/LICENSE
82+
7783
Can't wait to see your contributions! :smile:

README.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,10 @@ Additional dependencies can be added while installing the library:
3838

3939
- <b>More Language Models</b>: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
4040

41-
```bash
42-
pip install scrapegraphai[other-language-models]
43-
```
41+
42+
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
43+
```bash
44+
pip install scrapegraphai[other-language-models]
4445

4546
- <b>Semantic Options</b>: this group includes tools for advanced semantic processing, such as Graphviz.
4647

@@ -58,6 +59,13 @@ Additional dependencies can be added while installing the library:
5859

5960

6061

62+
### Installing "More Browser Options"
63+
64+
This group includes an OCR scraper for websites.
65+
```bash
66+
pip install scrapegraphai[screenshot_scraper]
67+
```
68+
6169
## 💻 Usage
6270
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
6371

docs/chinese.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ from scrapegraphai.graphs import SpeechGraph
133133
graph_config = {
134134
"llm": {
135135
"api_key": "OPENAI_API_KEY",
136-
"model": "gpt-3.5-turbo",
136+
"model": "openai/gpt-3.5-turbo",
137137
},
138138
"tts_model": {
139139
"api_key": "OPENAI_API_KEY",

docs/japanese.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ from scrapegraphai.graphs import SpeechGraph
133133
graph_config = {
134134
"llm": {
135135
"api_key": "OPENAI_API_KEY",
136-
"model": "gpt-3.5-turbo",
136+
"model": "openai/gpt-3.5-turbo",
137137
},
138138
"tts_model": {
139139
"api_key": "OPENAI_API_KEY",

docs/korean.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ from scrapegraphai.graphs import SpeechGraph
132132
graph_config = {
133133
"llm": {
134134
"api_key": "OPENAI_API_KEY",
135-
"model": "gpt-3.5-turbo",
135+
"model": "openai/gpt-3.5-turbo",
136136
},
137137
"tts_model": {
138138
"api_key": "OPENAI_API_KEY",

docs/russian.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ from scrapegraphai.graphs import SpeechGraph
138138
graph_config = {
139139
"llm": {
140140
"api_key": "OPENAI_API_KEY",
141-
"model": "gpt-3.5-turbo",
141+
"model": "openai/gpt-3.5-turbo",
142142
},
143143
"tts_model": {
144144
"api_key": "OPENAI_API_KEY",

docs/source/getting_started/examples.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ OpenAI models
2222
graph_config = {
2323
"llm": {
2424
"api_key": openai_key,
25-
"model": "gpt-3.5-turbo",
25+
"model": "openai/gpt-3.5-turbo",
2626
},
2727
}
2828

examples/anthropic/custom_graph_haiku.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040

4141
fetch_node = FetchNode(
4242
input="url | local_dir",
43-
output=["doc", "link_urls", "img_urls"],
43+
output=["doc"],
4444
node_config={
4545
"verbose": True,
4646
"headless": True,
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper while setting an API rate limit.
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import SmartScraperGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
11+
# required environment variables in .env
12+
# ANTHROPIC_API_KEY
13+
load_dotenv()
14+
15+
# ************************************************
16+
# Create the SmartScraperGraph instance and run it
17+
# ************************************************
18+
19+
graph_config = {
20+
"llm": {
21+
"api_key": os.getenv("ANTHROPIC_API_KEY"),
22+
"model": "anthropic/claude-3-haiku-20240307",
23+
"rate_limit": {
24+
"requests_per_second": 1
25+
}
26+
},
27+
}
28+
29+
smart_scraper_graph = SmartScraperGraph(
30+
prompt="""Don't say anything else. Output JSON only. List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
31+
event_end_date, event_end_time, location, event_mode, event_category,
32+
third_party_redirect, no_of_days,
33+
time_in_hours, hosted_or_attending, refreshments_type,
34+
registration_available, registration_link""",
35+
# also accepts a string with the already downloaded HTML code
36+
source="https://www.hmhco.com/event",
37+
config=graph_config
38+
)
39+
40+
result = smart_scraper_graph.run()
41+
print(result)
42+
43+
# ************************************************
44+
# Get graph execution info
45+
# ************************************************
46+
47+
graph_exec_info = smart_scraper_graph.get_execution_info()
48+
print(prettify_exec_info(graph_exec_info))

examples/anthropic/search_graph_schema_haiku.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import os
66
from typing import List
77
from dotenv import load_dotenv
8-
from langchain_core.pydantic_v1 import BaseModel, Field
8+
from pydantic import BaseModel, Field
99
from scrapegraphai.graphs import SearchGraph
1010

1111
load_dotenv()
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper
3+
"""
4+
5+
import os
6+
import json
7+
from dotenv import load_dotenv
8+
from scrapegraphai.graphs import SmartScraperMultiConcatGraph
9+
10+
load_dotenv()
11+
12+
# ************************************************
13+
# Create the SmartScraperGraph instance and run it
14+
# ************************************************
15+
16+
graph_config = {
17+
"llm": {
18+
"api_key": os.getenv("ANTHROPIC_API_KEY"),
19+
"model": "anthropic/claude-3-haiku-20240307",
20+
},
21+
}
22+
23+
24+
# *******************************************************
25+
# Create the SmartScraperMultiGraph instance and run it
26+
# *******************************************************
27+
28+
multiple_search_graph = SmartScraperMultiConcatGraph(
29+
prompt="Who is Marco Perini?",
30+
source= [
31+
"https://perinim.github.io/",
32+
"https://perinim.github.io/cv/"
33+
],
34+
schema=None,
35+
config=graph_config
36+
)
37+
38+
result = multiple_search_graph.run()
39+
print(json.dumps(result, indent=4))

examples/anthropic/smart_scraper_schema_haiku.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import os
66
from typing import List
7-
from langchain_core.pydantic_v1 import BaseModel, Field
7+
from pydantic import BaseModel, Field
88
from dotenv import load_dotenv
99
from scrapegraphai.graphs import SmartScraperGraph
1010
from scrapegraphai.utils import prettify_exec_info

examples/azure/rate_limit_azure.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper with a custom rate limit
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import SmartScraperGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
11+
# required environment variable in .env
12+
# AZURE_OPENAI_ENDPOINT
13+
# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
14+
# MODEL_NAME
15+
# AZURE_OPENAI_API_KEY
16+
# OPENAI_API_TYPE
17+
# AZURE_OPENAI_API_VERSION
18+
# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME
19+
load_dotenv()
20+
21+
22+
# ************************************************
23+
# Initialize the model instances
24+
# ************************************************
25+
26+
graph_config = {
27+
"llm": {
28+
"api_key": os.environ["AZURE_OPENAI_KEY"],
29+
"model": "azure_openai/gpt-3.5-turbo",
30+
"rate_limit": {
31+
"requests_per_second": 1
32+
},
33+
},
34+
"verbose": True,
35+
"headless": False
36+
}
37+
38+
smart_scraper_graph = SmartScraperGraph(
39+
prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
40+
event_end_date, event_end_time, location, event_mode, event_category,
41+
third_party_redirect, no_of_days,
42+
time_in_hours, hosted_or_attending, refreshments_type,
43+
registration_available, registration_link""",
44+
# also accepts a string with the already downloaded HTML code
45+
source="https://www.hmhco.com/event",
46+
config=graph_config
47+
)
48+
49+
result = smart_scraper_graph.run()
50+
print(result)
51+
52+
# ************************************************
53+
# Get graph execution info
54+
# ************************************************
55+
56+
graph_exec_info = smart_scraper_graph.get_execution_info()
57+
print(prettify_exec_info(graph_exec_info))

examples/azure/search_graph_schema_azure.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from scrapegraphai.graphs import SearchGraph
1010
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
1111

12-
from langchain_core.pydantic_v1 import BaseModel, Field
12+
from pydantic import BaseModel, Field
1313
from typing import List
1414

1515
# ************************************************

examples/azure/smart_scraper_multi_azure.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""
22
Basic example of scraping pipeline using SmartScraper
33
"""
4-
5-
import os, json
4+
import os
5+
import json
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperMultiGraph
88

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper
3+
"""
4+
5+
import os
6+
import json
7+
from dotenv import load_dotenv
8+
from scrapegraphai.graphs import SmartScraperMultiConcatGraph
9+
10+
load_dotenv()
11+
12+
# ************************************************
13+
# Define the configuration for the graph
14+
# ************************************************
15+
graph_config = {
16+
"llm": {
17+
"api_key": os.environ["AZURE_OPENAI_KEY"],
18+
"model": "azure_openai/gpt-3.5-turbo",
19+
},
20+
"verbose": True,
21+
"headless": False
22+
}
23+
24+
# *******************************************************
25+
# Create the SmartScraperMultiGraph instance and run it
26+
# *******************************************************
27+
28+
multiple_search_graph = SmartScraperMultiConcatGraph(
29+
prompt="Who is Marco Perini?",
30+
source= [
31+
"https://perinim.github.io/",
32+
"https://perinim.github.io/cv/"
33+
],
34+
schema=None,
35+
config=graph_config
36+
)
37+
38+
result = multiple_search_graph.run()
39+
print(json.dumps(result, indent=4))

examples/azure/smart_scraper_schema_azure.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import os
66
import json
77
from typing import List
8-
from langchain_core.pydantic_v1 import BaseModel, Field
8+
from pydantic import BaseModel, Field
99
from dotenv import load_dotenv
1010
from scrapegraphai.graphs import SmartScraperGraph
1111

examples/bedrock/custom_graph_bedrock.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555

5656
fetch_node = FetchNode(
5757
input="url | local_dir",
58-
output=["doc", "link_urls", "img_urls"],
58+
output=["doc"],
5959
node_config={
6060
"verbose": True,
6161
"headless": True,
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper with a custom rate limit
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import SmartScraperGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
load_dotenv()
11+
12+
13+
# ************************************************
14+
# Define the configuration for the graph
15+
# ************************************************
16+
17+
graph_config = {
18+
"llm": {
19+
"client": "client_name",
20+
"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
21+
"temperature": 0.0,
22+
"rate_limit": {
23+
"requests_per_second": 1
24+
},
25+
}
26+
}
27+
28+
# ************************************************
29+
# Create the SmartScraperGraph instance and run it
30+
# ************************************************
31+
32+
smart_scraper_graph = SmartScraperGraph(
33+
prompt="List me all the projects with their description",
34+
# also accepts a string with the already downloaded HTML code
35+
source="https://perinim.github.io/projects/",
36+
config=graph_config
37+
)
38+
39+
result = smart_scraper_graph.run()
40+
print(result)
41+
42+
# ************************************************
43+
# Get graph execution info
44+
# ************************************************
45+
46+
graph_exec_info = smart_scraper_graph.get_execution_info()
47+
print(prettify_exec_info(graph_exec_info))

examples/bedrock/search_graph_schema_bedrock.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from scrapegraphai.graphs import SearchGraph
55
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
66

7-
from langchain_core.pydantic_v1 import BaseModel, Field
7+
from pydantic import BaseModel, Field
88
from typing import List
99

1010
# ************************************************

0 commit comments

Comments
 (0)