ScrapeGraphAI
diff --git a/‎CHANGELOG.md
Lines changed: 233 additions & 2 deletions b/‎CHANGELOG.md
Lines changed: 233 additions & 2 deletions
diff --git a/‎README.md
Lines changed: 11 additions & 3 deletions b/‎README.md
Lines changed: 11 additions & 3 deletions
diff --git a/‎docs/chinese.md
Lines changed: 1 addition & 1 deletion b/‎docs/chinese.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/japanese.md
Lines changed: 1 addition & 1 deletion b/‎docs/japanese.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/korean.md
Lines changed: 1 addition & 1 deletion b/‎docs/korean.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/russian.md
Lines changed: 1 addition & 1 deletion b/‎docs/russian.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/getting_started/examples.rst
Lines changed: 1 addition & 1 deletion b/‎docs/source/getting_started/examples.rst
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/anthropic/custom_graph_haiku.py
Lines changed: 1 addition & 1 deletion b/‎examples/anthropic/custom_graph_haiku.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/anthropic/rate_limit_haiku.py
Lines changed: 48 additions & 0 deletions b/‎examples/anthropic/rate_limit_haiku.py
Lines changed: 48 additions & 0 deletions
diff --git a/‎examples/anthropic/smart_scraper_multi_concat_haiku.py
Lines changed: 39 additions & 0 deletions b/‎examples/anthropic/smart_scraper_multi_concat_haiku.py
Lines changed: 39 additions & 0 deletions
diff --git a/‎examples/azure/rate_limit_azure.py
Lines changed: 57 additions & 0 deletions b/‎examples/azure/rate_limit_azure.py
Lines changed: 57 additions & 0 deletions
diff --git a/‎examples/azure/smart_scraper_multi_azure.py
Lines changed: 2 additions & 2 deletions b/‎examples/azure/smart_scraper_multi_azure.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/azure/smart_scraper_multi_concat_azure.py
Lines changed: 39 additions & 0 deletions b/‎examples/azure/smart_scraper_multi_concat_azure.py
Lines changed: 39 additions & 0 deletions
diff --git a/‎examples/bedrock/custom_graph_bedrock.py
Lines changed: 1 addition & 1 deletion b/‎examples/bedrock/custom_graph_bedrock.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/bedrock/rate_limit_bedrock.py
Lines changed: 47 additions & 0 deletions b/‎examples/bedrock/rate_limit_bedrock.py
Lines changed: 47 additions & 0 deletions
diff --git a/‎examples/bedrock/smart_scraper_multi_bedrock.py
Lines changed: 1 addition & 4 deletions b/‎examples/bedrock/smart_scraper_multi_bedrock.py
Lines changed: 1 addition & 4 deletions
diff --git a/‎examples/bedrock/smart_scraper_multi_concat_bedrock.py
Lines changed: 35 additions & 0 deletions b/‎examples/bedrock/smart_scraper_multi_concat_bedrock.py
Lines changed: 35 additions & 0 deletions
diff --git a/‎examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py
Lines changed: 1 addition & 1 deletion b/‎examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py
Lines changed: 1 addition & 1 deletion
@@ -38,9 +38,10 @@ Additional dependecies can be added while installing the library:
 
 - <b>More Language Models</b>: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
 
-  ```bash
-  pip install scrapegraphai[other-language-models]
-  ```
+
+This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
+```bash
+pip install scrapegraphai[other-language-models]
+```
 
 - <b>Semantic Options</b>: this group includes tools for advanced semantic processing, such as Graphviz.
 
@@ -58,6 +59,13 @@ Additional dependecies can be added while installing the library:
 
 
 
+### Installing "More Browser Options"
+
+This group includes an OCR scraper for websites.
+```bash
+pip install scrapegraphai[screenshot_scraper]
+```
+
 ## 💻 Usage
 There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
 
 
@@ -133,7 +133,7 @@ from scrapegraphai.graphs import SpeechGraph
 graph_config = {
     "llm": {
         "api_key": "OPENAI_API_KEY",
-        "model": "gpt-3.5-turbo",
+        "model": "openai/gpt-3.5-turbo",
     },
     "tts_model": {
         "api_key": "OPENAI_API_KEY",
 
@@ -133,7 +133,7 @@ from scrapegraphai.graphs import SpeechGraph
 graph_config = {
     "llm": {
         "api_key": "OPENAI_API_KEY",
-        "model": "gpt-3.5-turbo",
+        "model": "openai/gpt-3.5-turbo",
     },
     "tts_model": {
         "api_key": "OPENAI_API_KEY",
 
@@ -132,7 +132,7 @@ from scrapegraphai.graphs import SpeechGraph
 graph_config = {
     "llm": {
         "api_key": "OPENAI_API_KEY",
-        "model": "gpt-3.5-turbo",
+        "model": "openai/gpt-3.5-turbo",
     },
     "tts_model": {
         "api_key": "OPENAI_API_KEY",
 
@@ -138,7 +138,7 @@ from scrapegraphai.graphs import SpeechGraph
 graph_config = {
     "llm": {
         "api_key": "OPENAI_API_KEY",
-        "model": "gpt-3.5-turbo",
+        "model": "openai/gpt-3.5-turbo",
     },
     "tts_model": {
         "api_key": "OPENAI_API_KEY",
 
@@ -22,7 +22,7 @@ OpenAI models
    graph_config = {
       "llm": {
          "api_key": openai_key,
-         "model": "gpt-3.5-turbo",
+         "model": "openai/gpt-3.5-turbo",
       },
    }
 
 
@@ -40,7 +40,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,
 
@@ -0,0 +1,48 @@
+""" 
+Basic example of scraping pipeline using SmartScraper while setting an API rate limit.
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+
+# Required environment variable in .env:
+# ANTHROPIC_API_KEY
+load_dotenv()
+
+# ************************************************
+# Configure, create and run the SmartScraperGraph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("ANTHROPIC_API_KEY"),  # os.getenv yields None when the variable is unset
+        "model": "anthropic/claude-3-haiku-20240307",
+        "rate_limit": {  # API rate limit for this LLM: 1 request per second
+            "requests_per_second": 1
+        }
+    },
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="""Don't say anything else. Output JSON only. List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, 
+    event_end_date, event_end_time, location, event_mode, event_category, 
+    third_party_redirect, no_of_days, 
+    time_in_hours, hosted_or_attending, refreshments_type, 
+    registration_available, registration_link""",
+    # `source` is a URL here; it also accepts a string with the already downloaded HTML code
+    source="https://www.hmhco.com/event",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Print the execution info collected by the graph
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
@@ -0,0 +1,39 @@
+""" 
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperMultiConcatGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("ANTHROPIC_API_KEY"),  # os.getenv yields None when the variable is unset
+        "model": "anthropic/claude-3-haiku-20240307",
+    },
+}
+
+
+# ***********************************************************
+# Create the SmartScraperMultiConcatGraph instance and run it
+# ***********************************************************
+
+multiple_search_graph = SmartScraperMultiConcatGraph(
+    prompt="Who is Marco Perini?",
+    source= [
+        "https://perinim.github.io/",
+        "https://perinim.github.io/cv/"
+        ],
+    schema=None,  # no output schema is supplied
+    config=graph_config
+)
+
+result = multiple_search_graph.run()
+print(json.dumps(result, indent=4))  # pretty-print the result as JSON
@@ -0,0 +1,57 @@
+""" 
+Basic example of scraping pipeline using SmartScraper with a custom rate limit
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+
+# Required environment variables in .env:
+# AZURE_OPENAI_ENDPOINT
+# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
+# MODEL_NAME
+# AZURE_OPENAI_API_KEY
+# OPENAI_API_TYPE
+# AZURE_OPENAI_API_VERSION
+# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME
+load_dotenv()
+
+
+# ************************************************
+# Configure, create and run the SmartScraperGraph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "api_key": os.environ["AZURE_OPENAI_KEY"],  # NOTE(review): the list above names AZURE_OPENAI_API_KEY, not AZURE_OPENAI_KEY - confirm which is intended
+        "model": "azure_openai/gpt-3.5-turbo",
+        "rate_limit": {  # API rate limit for this LLM: 1 request per second
+            "requests_per_second": 1
+        },
+    },
+    "verbose": True,
+    "headless": False
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, 
+    event_end_date, event_end_time, location, event_mode, event_category, 
+    third_party_redirect, no_of_days, 
+    time_in_hours, hosted_or_attending, refreshments_type, 
+    registration_available, registration_link""",
+    # `source` is a URL here; it also accepts a string with the already downloaded HTML code
+    source="https://www.hmhco.com/event",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Print the execution info collected by the graph
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
@@ -1,8 +1,8 @@
 """ 
 Basic example of scraping pipeline using SmartScraper
 """
-
-import os, json
+import os
+import json
 from dotenv import load_dotenv
 from scrapegraphai.graphs import SmartScraperMultiGraph
 
 
@@ -0,0 +1,39 @@
+""" 
+Basic example of scraping pipeline using SmartScraper
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperMultiConcatGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+graph_config = {
+    "llm": {
+        "api_key": os.environ["AZURE_OPENAI_KEY"],  # raises KeyError if AZURE_OPENAI_KEY is not set
+        "model": "azure_openai/gpt-3.5-turbo",
+    },
+    "verbose": True,
+    "headless": False
+}
+
+# ***********************************************************
+# Create the SmartScraperMultiConcatGraph instance and run it
+# ***********************************************************
+
+multiple_search_graph = SmartScraperMultiConcatGraph(
+    prompt="Who is Marco Perini?",
+    source= [
+        "https://perinim.github.io/",
+        "https://perinim.github.io/cv/"
+        ],
+    schema=None,  # no output schema is supplied
+    config=graph_config
+)
+
+result = multiple_search_graph.run()
+print(json.dumps(result, indent=4))  # pretty-print the result as JSON
@@ -55,7 +55,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,
 
@@ -0,0 +1,47 @@
+""" 
+Basic example of scraping pipeline using SmartScraper with a custom rate limit
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()  # NOTE(review): nothing below reads an environment variable - presumably kept for cloud credentials; confirm
+
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "client": "client_name",  # NOTE(review): looks like a placeholder value - confirm what the library expects here
+        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+        "temperature": 0.0,
+        "rate_limit": {  # API rate limit for this LLM: 1 request per second
+            "requests_per_second": 1
+        },
+    }
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me all the projects with their description",
+    # `source` is a URL here; it also accepts a string with the already downloaded HTML code
+    source="https://perinim.github.io/projects/",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Print the execution info collected by the graph
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
@@ -1,12 +1,9 @@
 """ 
 Basic example of scraping pipeline using SmartScraper
 """
-
-import os, json
-from dotenv import load_dotenv
+import json
 from scrapegraphai.graphs import SmartScraperMultiGraph
 
-load_dotenv()
 
 # ************************************************
 # Define the configuration for the graph
 
@@ -0,0 +1,35 @@
+""" 
+Basic example of scraping pipeline using SmartScraper
+"""
+import json
+from scrapegraphai.graphs import SmartScraperMultiConcatGraph
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "client": "client_name",  # NOTE(review): looks like a placeholder value - confirm what the library expects here
+        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+        "temperature": 0.0
+    }
+}
+
+
+# ***********************************************************
+# Create the SmartScraperMultiConcatGraph instance and run it
+# ***********************************************************
+
+multiple_search_graph = SmartScraperMultiConcatGraph(
+    prompt="Who is Marco Perini?",
+    source= [
+        "https://perinim.github.io/",
+        "https://perinim.github.io/cv/"
+        ],
+    schema=None,  # no output schema is supplied
+    config=graph_config
+)
+
+result = multiple_search_graph.run()
+print(json.dumps(result, indent=4))  # pretty-print the result as JSON
@@ -24,7 +24,7 @@
 graph_config = {
     "llm": {
         "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
+        "model": "openai/gpt-3.5-turbo",
     },
     "library": "beautifoulsoup"
 }
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ OpenAI models`
`22`	`22`	`graph_config = {`
`23`	`23`	`"llm": {`
`24`	`24`	`"api_key": openai_key,`
`25`		`- "model": "gpt-3.5-turbo",`
	`25`	`+ "model": "openai/gpt-3.5-turbo",`
`26`	`26`	`},`
`27`	`27`	`}`
`28`	`28`
Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@`
`24`	`24`	`graph_config = {`
`25`	`25`	`"llm": {`
`26`	`26`	`"api_key": openai_key,`
`27`		`- "model": "gpt-3.5-turbo",`
	`27`	`+ "model": "openai/gpt-3.5-turbo",`
`28`	`28`	`},`
`29`	`29`	`"library": "beautifoulsoup"`
`30`	`30`	`}`