Skip to content

Commit a2d5c75

Browse files
authored
Merge branch 'pre/beta' into main
2 parents 15be111 + c11331a commit a2d5c75

23 files changed

+771
-4
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ venv/
2828
*.sqlite
2929
*.google-cookie
3030
examples/graph_examples/ScrapeGraphAI_generated_graph
31-
examples/**/*.csv
31+
examples/**/result.csv
32+
examples/**/result.json
3233
main.py
3334
poetry.lock
3435

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
* **release:** 0.5.0-beta.5 [skip ci] ([5ac97e2](https://github.com/VinciGit00/Scrapegraph-ai/commit/5ac97e2fb321be40c9787fbf8cb53fa62cf0ce06))
5656
* **release:** 0.5.0-beta.6 [skip ci] ([9356124](https://github.com/VinciGit00/Scrapegraph-ai/commit/9356124ce39568e88f7d2965181579c4ff0a5752))
5757

58+
5859
## [0.5.0-beta.6](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.0-beta.5...v0.5.0-beta.6) (2024-04-30)
5960

6061

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""
2+
Basic example of a scraping pipeline using SmartScraperGraph with an Azure OpenAI key
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from langchain_openai import AzureChatOpenAI
8+
from langchain_openai import AzureOpenAIEmbeddings
9+
from scrapegraphai.graphs import SmartScraperGraph
10+
from scrapegraphai.utils import prettify_exec_info
11+
12+
13+
# Required environment variables in .env:
14+
# AZURE_OPENAI_ENDPOINT
15+
# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
16+
# MODEL_NAME
17+
# AZURE_OPENAI_API_KEY
18+
# OPENAI_API_TYPE
19+
# AZURE_OPENAI_API_VERSION
20+
# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME
21+
load_dotenv()
22+
23+
24+
# ************************************************
25+
# Initialize the model instances
26+
# ************************************************
27+
28+
llm_model_instance = AzureChatOpenAI(
29+
openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
30+
azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
31+
)
32+
33+
embedder_model_instance = AzureOpenAIEmbeddings(
34+
azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
35+
openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
36+
)
37+
38+
# ************************************************
39+
# Create the SmartScraperGraph instance and run it
40+
# ************************************************
41+
42+
graph_config = {
43+
"llm": {"model_instance": llm_model_instance},
44+
"embeddings": {"model_instance": embedder_model_instance}
45+
}
46+
47+
smart_scraper_graph = SmartScraperGraph(
48+
prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days, "
49+
"time_in_hours, hosted_or_attending, refreshments_type, registration_available, registration_link",
50+
# also accepts a string with the already downloaded HTML code
51+
source="https://www.hmhco.com/event",
52+
config=graph_config
53+
)
54+
55+
result = smart_scraper_graph.run()
56+
print(result)
57+
58+
# ************************************************
59+
# Get graph execution info
60+
# ************************************************
61+
62+
graph_exec_info = smart_scraper_graph.get_execution_info()
63+
print(prettify_exec_info(graph_exec_info))

examples/gemini/csv_scraper_gemini.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""
2+
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
import pandas as pd
8+
from scrapegraphai.graphs import CSVScraperGraph
9+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
10+
11+
load_dotenv()
12+
13+
# ************************************************
14+
# Read the csv file
15+
# ************************************************
16+
17+
text = pd.read_csv("inputs/username.csv")
18+
19+
# ************************************************
20+
# Define the configuration for the graph
21+
# ************************************************
22+
23+
graph_config = {
24+
"llm": {
25+
"model": "ollama/mistral",
26+
"temperature": 0,
27+
"format": "json", # Ollama needs the format to be specified explicitly
28+
# "model_tokens": 2000, # set context length arbitrarily
29+
"base_url": "http://localhost:11434",
30+
},
31+
"embeddings": {
32+
"model": "ollama/nomic-embed-text",
33+
"temperature": 0,
34+
"base_url": "http://localhost:11434",
35+
}
36+
}
37+
38+
# ************************************************
39+
# Create the CSVScraperGraph instance and run it
40+
# ************************************************
41+
42+
csv_scraper_graph = CSVScraperGraph(
43+
prompt="List me all the last names",
44+
source=str(text), # Pass the content of the file, not the file object
45+
config=graph_config
46+
)
47+
48+
result = csv_scraper_graph.run()
49+
print(result)
50+
51+
# ************************************************
52+
# Get graph execution info
53+
# ************************************************
54+
55+
graph_exec_info = csv_scraper_graph.get_execution_info()
56+
print(prettify_exec_info(graph_exec_info))
57+
58+
# Save the result to both CSV and JSON
59+
convert_to_csv(result, "result")
60+
convert_to_json(result, "result")

examples/gemini/inputs/username.csv

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username;Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/gemini/scrape_xml_gemini.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperGraph
88
from scrapegraphai.utils import prettify_exec_info
9+
910
load_dotenv()
1011

1112
# ************************************************
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
3+
"""
4+
5+
import pandas as pd
6+
from scrapegraphai.graphs import CSVScraperGraph
7+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
8+
9+
# ************************************************
10+
# Read the csv file
11+
# ************************************************
12+
13+
text = pd.read_csv("inputs/username.csv")
14+
15+
# ************************************************
16+
# Define the configuration for the graph
17+
# ************************************************
18+
19+
graph_config = {
20+
"llm": {
21+
"model": "ollama/mistral",
22+
"temperature": 0,
23+
"format": "json", # Ollama needs the format to be specified explicitly
24+
# "model_tokens": 2000, # set context length arbitrarily
25+
},
26+
"embeddings": {
27+
"model": "ollama/nomic-embed-text",
28+
"temperature": 0,
29+
}
30+
}
31+
32+
# ************************************************
33+
# Create the CSVScraperGraph instance and run it
34+
# ************************************************
35+
36+
csv_scraper_graph = CSVScraperGraph(
37+
prompt="List me all the last names",
38+
source=str(text), # Pass the content of the file, not the file object
39+
config=graph_config
40+
)
41+
42+
result = csv_scraper_graph.run()
43+
print(result)
44+
45+
# ************************************************
46+
# Get graph execution info
47+
# ************************************************
48+
49+
graph_exec_info = csv_scraper_graph.get_execution_info()
50+
print(prettify_exec_info(graph_exec_info))
51+
52+
# Save the result to both CSV and JSON
53+
convert_to_csv(result, "result")
54+
convert_to_json(result, "result")
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username;Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""
2+
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
3+
"""
4+
5+
import pandas as pd
6+
from scrapegraphai.graphs import CSVScraperGraph
7+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
8+
9+
# ************************************************
10+
# Read the csv file
11+
# ************************************************
12+
13+
text = pd.read_csv("inputs/username.csv")
14+
15+
# ************************************************
16+
# Define the configuration for the graph
17+
# ************************************************
18+
19+
graph_config = {
20+
"llm": {
21+
"model": "ollama/mistral",
22+
"temperature": 0,
23+
"format": "json", # Ollama needs the format to be specified explicitly
24+
# "model_tokens": 2000, # set context length arbitrarily
25+
"base_url": "http://localhost:11434",
26+
},
27+
"embeddings": {
28+
"model": "ollama/nomic-embed-text",
29+
"temperature": 0,
30+
"base_url": "http://localhost:11434",
31+
}
32+
}
33+
34+
# ************************************************
35+
# Create the CSVScraperGraph instance and run it
36+
# ************************************************
37+
38+
csv_scraper_graph = CSVScraperGraph(
39+
prompt="List me all the last names",
40+
source=str(text), # Pass the content of the file, not the file object
41+
config=graph_config
42+
)
43+
44+
result = csv_scraper_graph.run()
45+
print(result)
46+
47+
# ************************************************
48+
# Get graph execution info
49+
# ************************************************
50+
51+
graph_exec_info = csv_scraper_graph.get_execution_info()
52+
print(prettify_exec_info(graph_exec_info))
53+
54+
# Save the result to both CSV and JSON
55+
convert_to_csv(result, "result")
56+
convert_to_json(result, "result")
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username;Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/openai/csv_scraper_openai.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""
2+
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
import pandas as pd
8+
from scrapegraphai.graphs import CSVScraperGraph
9+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
10+
11+
load_dotenv()
12+
# ************************************************
13+
# Read the csv file
14+
# ************************************************
15+
16+
text = pd.read_csv("inputs/username.csv")
17+
18+
# ************************************************
19+
# Define the configuration for the graph
20+
# ************************************************
21+
22+
openai_key = os.getenv("OPENAI_APIKEY")
23+
24+
graph_config = {
25+
"llm": {
26+
"api_key": openai_key,
27+
"model": "gpt-3.5-turbo",
28+
},
29+
}
30+
31+
# ************************************************
32+
# Create the CSVScraperGraph instance and run it
33+
# ************************************************
34+
35+
csv_scraper_graph = CSVScraperGraph(
36+
prompt="List me all the last names",
37+
source=str(text), # Pass the content of the file, not the file object
38+
config=graph_config
39+
)
40+
41+
result = csv_scraper_graph.run()
42+
print(result)
43+
44+
# ************************************************
45+
# Get graph execution info
46+
# ************************************************
47+
48+
graph_exec_info = csv_scraper_graph.get_execution_info()
49+
print(prettify_exec_info(graph_exec_info))
50+
51+
# Save the result to both CSV and JSON
52+
convert_to_csv(result, "result")
53+
convert_to_json(result, "result")

examples/openai/inputs/username.csv

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username;Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/openai/scrape_plain_text_openai.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperGraph
88
from scrapegraphai.utils import prettify_exec_info
9+
910
load_dotenv()
1011

1112
# ************************************************

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ name = "scrapegraphai"
33

44
version = "0.5.2"
55

6+
67
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
78
authors = [
89
"Marco Vinciguerra <[email protected]>",

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@
99
from .script_creator_graph import ScriptCreatorGraph
1010
from .xml_scraper_graph import XMLScraperGraph
1111
from .json_scraper_graph import JSONScraperGraph
12+
from .csv_scraper_graph import CSVScraperGraph

0 commit comments

Comments
 (0)