
Commit 62b762c

Merge commit with 2 parents: c8eeff8 + 37517ae

25 files changed: 824 additions, 7 deletions

.gitignore

Lines changed: 2 additions & 1 deletion

@@ -28,7 +28,8 @@ venv/
 *.sqlite
 *.google-cookie
 examples/graph_examples/ScrapeGraphAI_generated_graph
-examples/**/*.csv
+examples/**/result.csv
+examples/**/result.json
 main.py
 poetry.lock
CHANGELOG.md

Lines changed: 23 additions & 0 deletions

@@ -1,3 +1,25 @@
+## [0.6.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1) (2024-05-02)
+
+
+### Bug Fixes
+
+* gemini error ([2ea54ea](https://github.com/VinciGit00/Scrapegraph-ai/commit/2ea54eab1d070e177c7d5ecfcc032b325fbd7c12))
+
+## [0.6.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.2...v0.6.0) (2024-05-02)
+
+
+### Features
+
+* added node and graph for CSV scraping ([4d542a8](https://github.com/VinciGit00/Scrapegraph-ai/commit/4d542a88f7d949a5ba360dcd880716c8110a5d14))
+* Allow end users to pass model instances for llm and embedding model ([b86aac2](https://github.com/VinciGit00/Scrapegraph-ai/commit/b86aac2188887642564a34d13d55d0fcff220ec1))
+* modified node name ([02d1af0](https://github.com/VinciGit00/Scrapegraph-ai/commit/02d1af006cb89bf860ee4f1186f582e2049a8e3d))
+
+
+### CI
+
+* **release:** 0.5.0-beta.7 [skip ci] ([40b2a34](https://github.com/VinciGit00/Scrapegraph-ai/commit/40b2a346d57865ca21915ecaa658096c52a2cc6b))
+* **release:** 0.5.0-beta.8 [skip ci] ([c11331a](https://github.com/VinciGit00/Scrapegraph-ai/commit/c11331a26ac325dfcf489272442ceeed13225a39))
+
 ## [0.5.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.1...v0.5.2) (2024-05-02)

@@ -55,6 +77,7 @@
 * **release:** 0.5.0-beta.5 [skip ci] ([5ac97e2](https://github.com/VinciGit00/Scrapegraph-ai/commit/5ac97e2fb321be40c9787fbf8cb53fa62cf0ce06))
 * **release:** 0.5.0-beta.6 [skip ci] ([9356124](https://github.com/VinciGit00/Scrapegraph-ai/commit/9356124ce39568e88f7d2965181579c4ff0a5752))

+
 ## [0.5.0-beta.6](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.0-beta.5...v0.5.0-beta.6) (2024-04-30)
README.md

Lines changed: 32 additions & 1 deletion

@@ -168,7 +168,38 @@ result = smart_scraper_graph.run()
 print(result)
 ```

-### Case 5: Extracting information using Gemini
+
+### Case 5: Extracting information using Azure
+```python
+import os
+from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+
+llm_model_instance = AzureChatOpenAI(
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
+)
+
+embedder_model_instance = AzureOpenAIEmbeddings(
+    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+)
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
+    event_end_date, event_end_time, location, event_mode, event_category,
+    third_party_redirect, no_of_days,
+    time_in_hours, hosted_or_attending, refreshments_type,
+    registration_available, registration_link""",
+    source="https://www.hmhco.com/event",
+    config=graph_config
+)
+```
+
+### Case 6: Extracting information using Gemini
 ```python
 from scrapegraphai.graphs import SmartScraperGraph
 GOOGLE_APIKEY = "YOUR_API_KEY"
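The Azure case above reads every setting from environment variables, so a missing value only surfaces as a `KeyError` inside the `AzureChatOpenAI` / `AzureOpenAIEmbeddings` constructors. A minimal pre-flight check is sketched below; the variable names are taken from this snippet and from the Azure example file added in this commit, and the check itself is an editorial assumption, not part of the library.

```python
# Sketch only: fail fast if the Azure OpenAI settings used above are missing.
# Variable names come from the README snippet and the new Azure example file;
# adjust the list to match your own deployment.
import os

REQUIRED_AZURE_VARS = [
    "AZURE_OPENAI_API_VERSION",
    "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME",
    "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
]

missing = [name for name in REQUIRED_AZURE_VARS if not os.getenv(name)]
if missing:
    raise RuntimeError(f"Missing Azure OpenAI settings: {', '.join(missing)}")
```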
Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+"""
+Basic example of a scraping pipeline using SmartScraper with an Azure OpenAI key
+"""
+
+import os
+from dotenv import load_dotenv
+from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+# Required environment variables in .env:
+# AZURE_OPENAI_ENDPOINT
+# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
+# MODEL_NAME
+# AZURE_OPENAI_API_KEY
+# OPENAI_API_TYPE
+# AZURE_OPENAI_API_VERSION
+# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME
+load_dotenv()
+
+# ************************************************
+# Initialize the model instances
+# ************************************************
+
+llm_model_instance = AzureChatOpenAI(
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
+)
+
+embedder_model_instance = AzureOpenAIEmbeddings(
+    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+)
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
+    event_end_date, event_end_time, location, event_mode, event_category,
+    third_party_redirect, no_of_days,
+    time_in_hours, hosted_or_attending, refreshments_type,
+    registration_available, registration_link""",
+    # also accepts a string with the already downloaded HTML code
+    source="https://www.hmhco.com/event",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
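The CSV-scraping examples added in this commit persist their output with `convert_to_csv` / `convert_to_json` from `scrapegraphai.utils`, which is what the new `examples/**/result.csv` and `examples/**/result.json` entries in `.gitignore` anticipate. The same utilities could presumably be appended to this Azure example as well; a short sketch, assuming the `"result"` base name used elsewhere in the commit:

```python
# Sketch only: persist the scraped result the same way the CSV-scraping
# examples in this commit do (the "result" base name matches the patterns
# added to .gitignore).
from scrapegraphai.utils import convert_to_csv, convert_to_json

convert_to_csv(result, "result")
convert_to_json(result, "result")
```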

examples/gemini/csv_scraper_gemini.py

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+"""
+Basic example of scraping pipeline using CSVScraperGraph from CSV documents
+"""
+
+import os
+from dotenv import load_dotenv
+import pandas as pd
+from scrapegraphai.graphs import CSVScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Read the csv file
+# ************************************************
+
+text = pd.read_csv("inputs/username.csv")
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "gemini-pro",
+    },
+}
+
+# ************************************************
+# Create the CSVScraperGraph instance and run it
+# ************************************************
+
+csv_scraper_graph = CSVScraperGraph(
+    prompt="List me all the last names",
+    source=str(text),  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = csv_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = csv_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
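One thing worth noting about the example above: `inputs/username.csv` (added below) is semicolon-delimited, while `pd.read_csv` defaults to a comma separator, so the DataFrame is read as a single column whose header is the whole `Username; Identifier;First name;Last name` string. The example still passes `str(text)` to the graph, so the raw content reaches the model either way, but if split columns are wanted the separator has to be given explicitly; a sketch under that assumption:

```python
# Sketch only: read the semicolon-delimited username.csv with an explicit
# separator so pandas actually splits the columns before the frame is
# stringified for CSVScraperGraph.
import pandas as pd

text = pd.read_csv("inputs/username.csv", sep=";", skipinitialspace=True)
print(text.columns.tolist())
```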

examples/gemini/inputs/username.csv

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+Username; Identifier;First name;Last name
+booker12;9012;Rachel;Booker
+grey07;2070;Laura;Grey
+johnson81;4081;Craig;Johnson
+jenkins46;9346;Mary;Jenkins
+smith79;5079;Jamie;Smith
+
examples/gemini/scrape_xml_gemini.py

Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@
 from dotenv import load_dotenv
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info
+
 load_dotenv()

 # ************************************************
examples/gemini/smart_scraper_gemini.py

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@
 graph_config = {
     "llm": {
         "api_key": gemini_key,
-        "model": "gpt-3.5-turbo",
+        "model": "gemini-pro",
     },
 }
Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@
+"""
+Basic example of scraping pipeline using CSVScraperGraph from CSV documents
+"""
+
+import pandas as pd
+from scrapegraphai.graphs import CSVScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Read the csv file
+# ************************************************
+
+text = pd.read_csv("inputs/username.csv")
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "model_tokens": 2000, # set context length arbitrarily
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+    }
+}
+
+# ************************************************
+# Create the CSVScraperGraph instance and run it
+# ************************************************
+
+csv_scraper_graph = CSVScraperGraph(
+    prompt="List me all the last names",
+    source=str(text),  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = csv_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = csv_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+Username; Identifier;First name;Last name
+booker12;9012;Rachel;Booker
+grey07;2070;Laura;Grey
+johnson81;4081;Craig;Johnson
+jenkins46;9346;Mary;Jenkins
+smith79;5079;Jamie;Smith
+
Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+"""
+Basic example of scraping pipeline using CSVScraperGraph from CSV documents
+"""
+
+import pandas as pd
+from scrapegraphai.graphs import CSVScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Read the csv file
+# ************************************************
+
+text = pd.read_csv("inputs/username.csv")
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "model_tokens": 2000, # set context length arbitrarily
+        "base_url": "http://localhost:11434",
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        "base_url": "http://localhost:11434",
+    }
+}
+
+# ************************************************
+# Create the CSVScraperGraph instance and run it
+# ************************************************
+
+csv_scraper_graph = CSVScraperGraph(
+    prompt="List me all the last names",
+    source=str(text),  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = csv_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = csv_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
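This variant pins both the LLM and the embedder to `http://localhost:11434`, the default Ollama endpoint. A quick reachability check before building the graph turns a down or misconfigured server into an immediate, readable error instead of a failure somewhere inside the pipeline; the sketch below uses only the standard library and is an added convenience, not part of ScrapeGraphAI.

```python
# Sketch only: confirm an Ollama server is listening on the base_url used in
# graph_config above. This checks reachability only; it does not verify that
# the "mistral" and "nomic-embed-text" models have been pulled.
from urllib.error import URLError
from urllib.request import urlopen

OLLAMA_BASE_URL = "http://localhost:11434"  # same value as in graph_config

try:
    with urlopen(OLLAMA_BASE_URL, timeout=5) as response:
        print(f"Ollama endpoint reachable (HTTP {response.status})")
except URLError as exc:
    raise SystemExit(f"Cannot reach Ollama at {OLLAMA_BASE_URL}: {exc}")
```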
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+Username; Identifier;First name;Last name
+booker12;9012;Rachel;Booker
+grey07;2070;Laura;Grey
+johnson81;4081;Craig;Johnson
+jenkins46;9346;Mary;Jenkins
+smith79;5079;Jamie;Smith
+
