
Commit 62b762c

Merge commit with 2 parents: c8eeff8 + 37517ae

25 files changed: 824 additions, 7 deletions

.gitignore

Lines changed: 2 additions & 1 deletion

@@ -28,7 +28,8 @@ venv/
 *.sqlite
 *.google-cookie
 examples/graph_examples/ScrapeGraphAI_generated_graph
-examples/**/*.csv
+examples/**/result.csv
+examples/**/result.json
 main.py
 poetry.lock
CHANGELOG.md

Lines changed: 23 additions & 0 deletions

@@ -1,3 +1,25 @@
+## [0.6.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1) (2024-05-02)
+
+
+### Bug Fixes
+
+* gemini error ([2ea54ea](https://github.com/VinciGit00/Scrapegraph-ai/commit/2ea54eab1d070e177c7d5ecfcc032b325fbd7c12))
+
+## [0.6.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.2...v0.6.0) (2024-05-02)
+
+
+### Features
+
+* added node and graph for CSV scraping ([4d542a8](https://github.com/VinciGit00/Scrapegraph-ai/commit/4d542a88f7d949a5ba360dcd880716c8110a5d14))
+* Allow end users to pass model instances for llm and embedding model ([b86aac2](https://github.com/VinciGit00/Scrapegraph-ai/commit/b86aac2188887642564a34d13d55d0fcff220ec1))
+* modified node name ([02d1af0](https://github.com/VinciGit00/Scrapegraph-ai/commit/02d1af006cb89bf860ee4f1186f582e2049a8e3d))
+
+
+### CI
+
+* **release:** 0.5.0-beta.7 [skip ci] ([40b2a34](https://github.com/VinciGit00/Scrapegraph-ai/commit/40b2a346d57865ca21915ecaa658096c52a2cc6b))
+* **release:** 0.5.0-beta.8 [skip ci] ([c11331a](https://github.com/VinciGit00/Scrapegraph-ai/commit/c11331a26ac325dfcf489272442ceeed13225a39))
+
 ## [0.5.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.1...v0.5.2) (2024-05-02)

@@ -55,6 +77,7 @@
 * **release:** 0.5.0-beta.5 [skip ci] ([5ac97e2](https://github.com/VinciGit00/Scrapegraph-ai/commit/5ac97e2fb321be40c9787fbf8cb53fa62cf0ce06))
 * **release:** 0.5.0-beta.6 [skip ci] ([9356124](https://github.com/VinciGit00/Scrapegraph-ai/commit/9356124ce39568e88f7d2965181579c4ff0a5752))

+
 ## [0.5.0-beta.6](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.0-beta.5...v0.5.0-beta.6) (2024-04-30)
README.md

Lines changed: 32 additions & 1 deletion

@@ -168,7 +168,38 @@ result = smart_scraper_graph.run()
 print(result)
 ```

-### Case 5: Extracting information using Gemini
+
+### Case 5: Extracting information using Azure
+```python
+import os
+from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+
+llm_model_instance = AzureChatOpenAI(
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
+)
+
+embedder_model_instance = AzureOpenAIEmbeddings(
+    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+)
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
+    event_end_date, event_end_time, location, event_mode, event_category,
+    third_party_redirect, no_of_days,
+    time_in_hours, hosted_or_attending, refreshments_type,
+    registration_available, registration_link""",
+    source="https://www.hmhco.com/event",
+    config=graph_config
+)
+```
+
+### Case 6: Extracting information using Gemini
 ```python
 from scrapegraphai.graphs import SmartScraperGraph
 GOOGLE_APIKEY = "YOUR_API_KEY"
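The Azure case above reads every setting from environment variables, so a missing value only surfaces as a `KeyError` inside the `AzureChatOpenAI` / `AzureOpenAIEmbeddings` constructors. A minimal pre-flight check is sketched below; the variable names are taken from this snippet and from the Azure example file added in this commit, and the check itself is an editorial assumption, not part of the library.

```python
# Sketch only: fail fast if the Azure OpenAI settings used above are missing.
# Variable names come from the README snippet and the new Azure example file;
# adjust the list to match your own deployment.
import os

REQUIRED_AZURE_VARS = [
    "AZURE_OPENAI_API_VERSION",
    "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME",
    "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
]

missing = [name for name in REQUIRED_AZURE_VARS if not os.getenv(name)]
if missing:
    raise RuntimeError(f"Missing Azure OpenAI settings: {', '.join(missing)}")
```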
Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+"""
+Basic example of a scraping pipeline using SmartScraper with an Azure OpenAI key
+"""
+
+import os
+from dotenv import load_dotenv
+from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+# Required environment variables in .env:
+# AZURE_OPENAI_ENDPOINT
+# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
+# MODEL_NAME
+# AZURE_OPENAI_API_KEY
+# OPENAI_API_TYPE
+# AZURE_OPENAI_API_VERSION
+# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME
+load_dotenv()
+
+# ************************************************
+# Initialize the model instances
+# ************************************************
+
+llm_model_instance = AzureChatOpenAI(
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
+)
+
+embedder_model_instance = AzureOpenAIEmbeddings(
+    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+)
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
+    event_end_date, event_end_time, location, event_mode, event_category,
+    third_party_redirect, no_of_days,
+    time_in_hours, hosted_or_attending, refreshments_type,
+    registration_available, registration_link""",
+    # also accepts a string with the already downloaded HTML code
+    source="https://www.hmhco.com/event",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
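The CSV-scraping examples added in this commit persist their output with `convert_to_csv` / `convert_to_json` from `scrapegraphai.utils`, which is what the new `examples/**/result.csv` and `examples/**/result.json` entries in `.gitignore` anticipate. The same utilities could presumably be appended to this Azure example as well; a short sketch, assuming the `"result"` base name used elsewhere in the commit:

```python
# Sketch only: persist the scraped result the same way the CSV-scraping
# examples in this commit do (the "result" base name matches the patterns
# added to .gitignore).
from scrapegraphai.utils import convert_to_csv, convert_to_json

convert_to_csv(result, "result")
convert_to_json(result, "result")
```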

examples/gemini/csv_scraper_gemini.py

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+"""
+Basic example of scraping pipeline using CSVScraperGraph from CSV documents
+"""
+
+import os
+from dotenv import load_dotenv
+import pandas as pd
+from scrapegraphai.graphs import CSVScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Read the csv file
+# ************************************************
+
+text = pd.read_csv("inputs/username.csv")
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "gemini-pro",
+    },
+}
+
+# ************************************************
+# Create the CSVScraperGraph instance and run it
+# ************************************************
+
+csv_scraper_graph = CSVScraperGraph(
+    prompt="List me all the last names",
+    source=str(text),  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = csv_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = csv_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
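One thing worth noting about the example above: `inputs/username.csv` (added below) is semicolon-delimited, while `pd.read_csv` defaults to a comma separator, so the DataFrame is read as a single column whose header is the whole `Username; Identifier;First name;Last name` string. The example still passes `str(text)` to the graph, so the raw content reaches the model either way, but if split columns are wanted the separator has to be given explicitly; a sketch under that assumption:

```python
# Sketch only: read the semicolon-delimited username.csv with an explicit
# separator so pandas actually splits the columns before the frame is
# stringified for CSVScraperGraph.
import pandas as pd

text = pd.read_csv("inputs/username.csv", sep=";", skipinitialspace=True)
print(text.columns.tolist())
```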

examples/gemini/inputs/username.csv

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+Username; Identifier;First name;Last name
+booker12;9012;Rachel;Booker
+grey07;2070;Laura;Grey
+johnson81;4081;Craig;Johnson
+jenkins46;9346;Mary;Jenkins
+smith79;5079;Jamie;Smith
+
examples/gemini/scrape_xml_gemini.py

Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@
 from dotenv import load_dotenv
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info
+
 load_dotenv()

 # ************************************************
examples/gemini/smart_scraper_gemini.py

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@
 graph_config = {
     "llm": {
         "api_key": gemini_key,
-        "model": "gpt-3.5-turbo",
+        "model": "gemini-pro",
     },
 }
Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@
+"""
+Basic example of scraping pipeline using CSVScraperGraph from CSV documents
+"""
+
+import pandas as pd
+from scrapegraphai.graphs import CSVScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Read the csv file
+# ************************************************
+
+text = pd.read_csv("inputs/username.csv")
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "model_tokens": 2000, # set context length arbitrarily
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+    }
+}
+
+# ************************************************
+# Create the CSVScraperGraph instance and run it
+# ************************************************
+
+csv_scraper_graph = CSVScraperGraph(
+    prompt="List me all the last names",
+    source=str(text),  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = csv_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = csv_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+Username; Identifier;First name;Last name
+booker12;9012;Rachel;Booker
+grey07;2070;Laura;Grey
+johnson81;4081;Craig;Johnson
+jenkins46;9346;Mary;Jenkins
+smith79;5079;Jamie;Smith
+
Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+"""
+Basic example of scraping pipeline using CSVScraperGraph from CSV documents
+"""
+
+import pandas as pd
+from scrapegraphai.graphs import CSVScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Read the csv file
+# ************************************************
+
+text = pd.read_csv("inputs/username.csv")
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "model_tokens": 2000, # set context length arbitrarily
+        "base_url": "http://localhost:11434",
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        "base_url": "http://localhost:11434",
+    }
+}
+
+# ************************************************
+# Create the CSVScraperGraph instance and run it
+# ************************************************
+
+csv_scraper_graph = CSVScraperGraph(
+    prompt="List me all the last names",
+    source=str(text),  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = csv_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = csv_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
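This variant pins both the LLM and the embedder to `http://localhost:11434`, the default Ollama endpoint. A quick reachability check before building the graph turns a down or misconfigured server into an immediate, readable error instead of a failure somewhere inside the pipeline; the sketch below uses only the standard library and is an added convenience, not part of ScrapeGraphAI.

```python
# Sketch only: confirm an Ollama server is listening on the base_url used in
# graph_config above. This checks reachability only; it does not verify that
# the "mistral" and "nomic-embed-text" models have been pulled.
from urllib.error import URLError
from urllib.request import urlopen

OLLAMA_BASE_URL = "http://localhost:11434"  # same value as in graph_config

try:
    with urlopen(OLLAMA_BASE_URL, timeout=5) as response:
        print(f"Ollama endpoint reachable (HTTP {response.status})")
except URLError as exc:
    raise SystemExit(f"Cannot reach Ollama at {OLLAMA_BASE_URL}: {exc}")
```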
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+Username; Identifier;First name;Last name
+booker12;9012;Rachel;Booker
+grey07;2070;Laura;Grey
+johnson81;4081;Craig;Johnson
+jenkins46;9346;Mary;Jenkins
+smith79;5079;Jamie;Smith
+
